{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "AIBT63HBkXPU",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "AIBT63HBkXPU",
    "outputId": "e0bc1489-d9db-41fb-b67d-64ed4b54b2b6"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Requirement already satisfied: pymorphy2 in /usr/local/lib/python3.7/dist-packages (0.9.1)\n",
      "Requirement already satisfied: pymorphy2-dicts-ru<3.0,>=2.4 in /usr/local/lib/python3.7/dist-packages (from pymorphy2) (2.4.417127.4579844)\n",
      "Requirement already satisfied: dawg-python>=0.7.1 in /usr/local/lib/python3.7/dist-packages (from pymorphy2) (0.7.2)\n",
      "Requirement already satisfied: docopt>=0.6 in /usr/local/lib/python3.7/dist-packages (from pymorphy2) (0.6.2)\n"
     ]
    }
   ],
   "source": [
    "%pip install -q pymorphy2==0.9.1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d0b99024",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "d0b99024",
    "outputId": "e931e0b2-0e94-48aa-abf2-d08d21c6e182"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
      "[nltk_data]   Package punkt is already up-to-date!\n",
      "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
      "[nltk_data]   Package stopwords is already up-to-date!\n",
      "Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n",
      "Populating the interactive namespace from numpy and matplotlib\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "import bz2\n",
    "import re\n",
    "from tqdm.notebook import tqdm\n",
    "from collections import Counter\n",
    "import pymorphy2\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import nltk\n",
    "nltk.download('punkt')\n",
    "nltk.download('stopwords')\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import f1_score, accuracy_score, confusion_matrix\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.pipeline import Pipeline\n",
    "from gensim.corpora import Dictionary\n",
    "from gensim.models import TfidfModel, lsimodel\n",
    "from gensim import similarities\n",
    "\n",
    "%matplotlib inline\n",
    "%pylab inline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d6b7c65",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 49,
     "referenced_widgets": [
      "f38d3adb3e7c4797beec874066ccd8eb",
      "72d64d39a42146a19e1d373b6078b281",
      "652418a65f4e4fd8b9b2f60d8f5edda0",
      "a9ad1c47ba4744eaaad443059742db35",
      "2f70f53cb8b3495eb84b7463456a4340",
      "c95311ef48f4406095b304778a95aecb",
      "31fb0b358fa6469e852a4061f1a70ca2",
      "dbc753abddc8445991a86435910fbdae",
      "92016471275947f7af5c1ef166e86611",
      "d00ef25ddd2841f2ad7f71998ef5fe7f",
      "ce46f2c180d14c6bacdfd764dceef258"
     ]
    },
    "id": "6d6b7c65",
    "outputId": "eba06094-2885-4e5f-aca4-d5346a686f57"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "f38d3adb3e7c4797beec874066ccd8eb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "0it [00:00, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Read the bz2 archive line by line (one JSON object per line) and keep the\n",
    "# entries whose `rating_not_checked` flag is falsy and whose text is not blank.\n",
    "responses = []\n",
    "with bz2.BZ2File('banki_responses.json.bz2', 'r') as thefile:\n",
    "    for row in tqdm(thefile):\n",
    "        resp = json.loads(row)\n",
    "        if resp['rating_not_checked']:\n",
    "            continue\n",
    "        if not resp['text'].split():\n",
    "            continue\n",
    "        responses.append(resp)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "93224305",
   "metadata": {
    "id": "93224305"
   },
   "source": [
    "# Домашнее задание по NLP # 1 [100 баллов]\n",
    "\n",
    "Классификация по тональности\n",
    "\n",
    "В этом домашнем задании необходимо классифицировать по тональности отзывы на банки с сайта banki.ru.\n",
    "\n",
    "Данные содержат непосредственно тексты отзывов, некоторую дополнительную информацию, а также оценку по шкале от 1 до 5. Тексты хранятся в json-ах в массиве responses."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "63c79a3e",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 206
    },
    "id": "63c79a3e",
    "outputId": "4e5c036c-725f-4d49-8e4e-8083082d7edc"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>city</th>\n",
       "      <th>rating_not_checked</th>\n",
       "      <th>title</th>\n",
       "      <th>num_comments</th>\n",
       "      <th>bank_license</th>\n",
       "      <th>author</th>\n",
       "      <th>bank_name</th>\n",
       "      <th>datetime</th>\n",
       "      <th>text</th>\n",
       "      <th>rating_grade</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>г. Москва</td>\n",
       "      <td>False</td>\n",
       "      <td>Жалоба</td>\n",
       "      <td>0</td>\n",
       "      <td>лицензия № 2562</td>\n",
       "      <td>uhnov1</td>\n",
       "      <td>Бинбанк</td>\n",
       "      <td>2015-06-08 12:50:54</td>\n",
       "      <td>Добрый день! Я не являюсь клиентом банка и пор...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>г. Новосибирск</td>\n",
       "      <td>False</td>\n",
       "      <td>Не могу пользоваться услугой Сбербанк он-лайн</td>\n",
       "      <td>0</td>\n",
       "      <td>лицензия № 1481</td>\n",
       "      <td>Foryou</td>\n",
       "      <td>Сбербанк России</td>\n",
       "      <td>2015-06-08 11:09:57</td>\n",
       "      <td>Доброго дня! Являюсь держателем зарплатной кар...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>г. Москва</td>\n",
       "      <td>False</td>\n",
       "      <td>Двойное списание за один товар.</td>\n",
       "      <td>1</td>\n",
       "      <td>лицензия № 2562</td>\n",
       "      <td>Vladimir84</td>\n",
       "      <td>Бинбанк</td>\n",
       "      <td>2015-06-05 20:14:28</td>\n",
       "      <td>Здравствуйте!  Дублирую свое заявление от 03.0...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>г. Ставрополь</td>\n",
       "      <td>False</td>\n",
       "      <td>Меняют проценты комиссии  не предупредив и не ...</td>\n",
       "      <td>2</td>\n",
       "      <td>лицензия № 1481</td>\n",
       "      <td>643609</td>\n",
       "      <td>Сбербанк России</td>\n",
       "      <td>2015-06-05 13:51:01</td>\n",
       "      <td>Добрый день!! Я открыл расчетный счет в СберБа...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>г. Челябинск</td>\n",
       "      <td>False</td>\n",
       "      <td>Верните денежные средства за страховку</td>\n",
       "      <td>1</td>\n",
       "      <td>лицензия № 2766</td>\n",
       "      <td>anfisa-2003</td>\n",
       "      <td>ОТП Банк</td>\n",
       "      <td>2015-06-05 10:58:12</td>\n",
       "      <td>04.03.2015 г. взяла кредит в вашем банке, заяв...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "             city  ...  rating_grade\n",
       "0       г. Москва  ...           NaN\n",
       "1  г. Новосибирск  ...           NaN\n",
       "2       г. Москва  ...           NaN\n",
       "3   г. Ставрополь  ...           NaN\n",
       "4    г. Челябинск  ...           NaN\n",
       "\n",
       "[5 rows x 10 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Put the filtered reviews into a table and preview the first five rows.\n",
    "df_banki = pd.DataFrame(data=responses)\n",
    "df_banki.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fbf6bf1d",
   "metadata": {
    "id": "fbf6bf1d"
   },
   "source": [
    "## Часть 1. Анализ текстов [40/100]\n",
    "### 1.1 Подсчет количества отзывов в разных городах и на разные банки "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6d2dba03",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 394
    },
    "id": "6d2dba03",
    "outputId": "2e2e28e5-d239-4164-9e50-c90dc24b7b1c"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>message_count</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>bank_name</th>\n",
       "      <th>city</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>Сбербанк России</th>\n",
       "      <th>г. Москва</th>\n",
       "      <td>8146</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Альфа-Банк</th>\n",
       "      <th>г. Москва</th>\n",
       "      <td>3871</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Русский Стандарт</th>\n",
       "      <th>г. Москва</th>\n",
       "      <td>2801</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>ВТБ 24</th>\n",
       "      <th>г. Москва</th>\n",
       "      <td>2714</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Сбербанк России</th>\n",
       "      <th>г. Санкт-Петербург</th>\n",
       "      <td>2434</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Московский Кредитный Банк</th>\n",
       "      <th>г. Москва</th>\n",
       "      <td>2348</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Тинькофф Банк</th>\n",
       "      <th>г. Москва</th>\n",
       "      <td>2339</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Хоум Кредит Банк</th>\n",
       "      <th>г. Москва</th>\n",
       "      <td>2143</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Национальный Банк «Траст»</th>\n",
       "      <th>г. Москва</th>\n",
       "      <td>1654</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>Райффайзенбанк</th>\n",
       "      <th>г. Москва</th>\n",
       "      <td>1426</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                              message_count\n",
       "bank_name                 city                             \n",
       "Сбербанк России           г. Москва                    8146\n",
       "Альфа-Банк                г. Москва                    3871\n",
       "Русский Стандарт          г. Москва                    2801\n",
       "ВТБ 24                    г. Москва                    2714\n",
       "Сбербанк России           г. Санкт-Петербург           2434\n",
       "Московский Кредитный Банк г. Москва                    2348\n",
       "Тинькофф Банк             г. Москва                    2339\n",
       "Хоум Кредит Банк          г. Москва                    2143\n",
       "Национальный Банк «Траст» г. Москва                    1654\n",
       "Райффайзенбанк            г. Москва                    1426"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of reviews per (bank, city) pair, most-reviewed pairs first.\n",
    "# size() counts rows, so a review with a missing `bank_license` is still counted;\n",
    "# counting the non-null `bank_license` values would silently skip such reviews.\n",
    "df_banki_message_stat = (\n",
    "    df_banki\n",
    "    .groupby(['bank_name', 'city'])\n",
    "    .size()\n",
    "    .to_frame('message_count')\n",
    "    .sort_values('message_count', ascending=False)\n",
    ")\n",
    "df_banki_message_stat.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1e9ed872",
   "metadata": {
    "id": "1e9ed872"
   },
   "source": [
    "### 1.2 Постройте гистограммы длин слов в символах и в словах"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "913b9587",
   "metadata": {
    "id": "913b9587"
   },
   "outputs": [],
   "source": [
    "class TextPrepare:\n",
    "\n",
    "    def __init__(self, pymorphy, stopwords=None, vocab=None):\n",
    "        \"\"\" Подготовка и операции с текстом\n",
    "\n",
    "        Параметры\n",
    "        ---------\n",
    "        pymorphy : MorphAnalyzer\n",
    "          Экземпляр MorphAnalyzer для лемматизации и поиска тегов\n",
    "        stopwords : list\n",
    "          Список стоп-слов, по умолчанию None\n",
    "        vocab : dict\n",
    "          Словарь в формате {слово: {'lemma': лемма, 'pos': pos-тег}}\n",
    "          В данном словаре класс будет искать леммы и pos-теги.\n",
    "          Если слова в словаре не будет, то поиск будет осуществляться через\n",
    "          pymorphy. И неизвестное слово будет добавлено в словарь.\n",
    "          Можно задать пустой словарь, тогда он будет наполняться заново.\n",
    "        \"\"\"\n",
    "\n",
    "        assert isinstance(pymorphy, pymorphy2.MorphAnalyzer), \"pymorphy доджен быть объектом pymorphy2.MorphAnalyzer\"\n",
    "        if stopwords is not None:\n",
    "            assert isinstance(stopwords, list), \"stopwords должен быть объектом list\"\n",
    "        self._stopwords = stopwords\n",
    "        self._pymorphy = pymorphy\n",
    "        if not isinstance(vocab, dict):\n",
    "            self._vocab = {}\n",
    "        else:\n",
    "            self._vocab = vocab\n",
    "\n",
    "\n",
    "    def _lemmatize(self, word):\n",
    "        \"\"\"Лемматизация и поиск pos-тега с помощью pymorphy\"\"\"\n",
    "        parse = self._pymorphy.parse(word)[0]\n",
    "        lemma = parse.normal_form\n",
    "        pos = parse.tag.POS\n",
    "        return lemma, pos\n",
    "\n",
    "\n",
    "    def fit(self, text):\n",
    "        \"\"\" Подготовка текста\n",
    "        \"\"\"\n",
    "        text = text.lower().strip()\n",
    "        text = re.sub(r'[^А-Яа-яA-Za-z ]', ' ', text)\n",
    "        self._tokens = nltk.tokenize.word_tokenize(text, language='russian')\n",
    "        del(text)\n",
    "        self._is_fitted = True\n",
    "\n",
    "\n",
    "    def _get_tokens_and_pos(self, lemmatize=False, stopwords=False, pos_enable=False):\n",
    "        \"\"\" Получение токенов и pos-тегов\n",
    "\n",
    "        Параметры\n",
    "        ---------\n",
    "        lemmatize : bool\n",
    "          Флаг лемматизации. Если True, будет выполнена лемматизация.\n",
    "          По умолчанию False.\n",
    "        stopwords : bool\n",
    "          Флаг фильтрации по стоп-словам. Если True, будет выполнена фильтрация.\n",
    "          По умолчанию False.\n",
    "        pos_enable : bool\n",
    "          Флаг поиска pos-тегов. Если True, в выдачу будет добавлен pos-тег.\n",
    "          По умолчанию False.\n",
    "\n",
    "\n",
    "        Результат\n",
    "        ---------\n",
    "        tokens : list\n",
    "          Если pos_enable = False, формат [токен1, токен2, ... ]\n",
    "          Если pos_enable = True, формат [(токен1, pos1), (токен2, pos2), ... ]\n",
    "        \"\"\" \n",
    "\n",
    "        if not self._is_fitted:\n",
    "            raise Exception(\"Требуется предварительно запустить метод fit\")\n",
    "\n",
    "        tokens = self._tokens \n",
    "\n",
    "        if stopwords and self._stopwords is None:\n",
    "            raise ValueError(\"Не задан список стоп-слов\")\n",
    "        \n",
    "        if stopwords:\n",
    "            tokens = [token for token in self._tokens if token not in self._stopwords]\n",
    "\n",
    "        if lemmatize or pos_enable:\n",
    "\n",
    "            tokens_lemm = []\n",
    "            for token in tokens:\n",
    "                if token not in self._vocab:\n",
    "                    lemma, pos = self._lemmatize(token)\n",
    "                    self._vocab[token] = {'lemma': lemma, 'pos': pos}\n",
    "                if pos_enable:\n",
    "                    tokens_lemm.append((self._vocab[token]['lemma'], \n",
    "                                        self._vocab[token]['pos']))\n",
    "                else:\n",
    "                    tokens_lemm.append(self._vocab[token]['lemma'])\n",
    "\n",
    "            tokens = tokens_lemm\n",
    "\n",
    "        return tokens\n",
    "    \n",
    "    \n",
    "    def get_tokens(self, lemmatize=False, stopwords=False):\n",
    "        \"\"\" Получение токенов\n",
    "\n",
    "        Параметры\n",
    "        ---------\n",
    "        emmatize : bool\n",
    "          Флаг лемматизации. Если True, будет выполнена лемматизация.\n",
    "          По умолчанию False.\n",
    "        stopwords : bool\n",
    "          Флаг фильтрации по стоп-словам. Если True, будет выполнена фильтрация.\n",
    "          По умолчанию False.\n",
    "\n",
    "        Результат\n",
    "        ---------\n",
    "        tokens : list\n",
    "          Формат [токен1, токен2, ... ]\n",
    "        \"\"\"\n",
    "        tokens = self._get_tokens_and_pos(lemmatize=lemmatize, \n",
    "                                          stopwords=stopwords, pos_enable=False)\n",
    "        return tokens\n",
    "\n",
    "\n",
    "    def get_pos_tags(self, tag_filter=None, stopwords=False):\n",
    "        \"\"\" Получение токенов с pos-тегами. Токены будут лемматизированы.\n",
    "\n",
    "        Параметры\n",
    "        ---------\n",
    "        tag_filter : str\n",
    "          POS-тег для фильтрации выдачи. По умолчанию None.\n",
    "        stopwords : bool\n",
    "          Флаг фильтрации по стоп-словам. Если True, будет выполнена фильтрация.\n",
    "          По умолчанию False.\n",
    "\n",
    "        Результат\n",
    "        ---------\n",
    "        tokens : list\n",
    "          Формат: [(токен1, pos1), (токен2, pos2), ... ]\n",
    "        \"\"\"\n",
    "        tokens = self._get_tokens_and_pos(stopwords=stopwords, pos_enable=True)\n",
    "        return tokens\n",
    "\n",
    "    def seriallize(self, path=None):\n",
    "        \"\"\"Сохранение параметров модели\n",
    "        Сохраняется только vocab и stopwords\n",
    "        \"\"\"\n",
    "        model_params = {'vocab': self._vocab, 'stopwords': self._stopwords}\n",
    "        if not path:\n",
    "            path = 'text_prepare_model.json'\n",
    "        with open(path, 'w', encoding='utf-8') as file:\n",
    "            json.dump(model_params, file)\n",
    "\n",
    "\n",
    "    @classmethod\n",
    "    def load_from_file(cls, pymorphy, path):\n",
    "        \"\"\" Создание модели с параметрами из файла\n",
    "\n",
    "        Параметры\n",
    "        ---------\n",
    "        pymorphy : MorphAnalyzer\n",
    "          Экземпляр MorphAnalyzer для лемматизации и поиска тегов\n",
    "        path:\n",
    "          Путь к файлу модели в формате json\n",
    "        \n",
    "        Результат\n",
    "        ---------\n",
    "        text_prep_model : TextPrepare\n",
    "        \"\"\"\n",
    "\n",
    "        with open(path, 'r', encoding='utf-8') as file:\n",
    "            model_params = json.load(file)\n",
    "\n",
    "        return cls(pymorphy=pymorphy, stopwords=model_params.get('stopwords'), \n",
    "                   vocab=model_params.get('vocab'))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "pUNwCBXnY1e-",
   "metadata": {
    "id": "pUNwCBXnY1e-"
   },
   "source": [
    "Найдем статистику количества символов в словах и количества слов в текстах"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a0d50e1",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 49,
     "referenced_widgets": [
      "71b0f75cda12470fbe1d005de7cd9b68",
      "047167382bc04c2d99d7b55fb3a15a54",
      "adfdf91167464e5183efeb02089b3fbf",
      "94709408bc7347ea970916c3248705bb",
      "8731c6c7f0e7404cac842eea1d1d59bc",
      "02d06e2884af45a98c8a028d0aa75779",
      "9ece8493698c43c38731e3c196354482",
      "d187aa0c7fcc471bba4a132166cf6681",
      "89b5d649d59440c198c997e6cd68398d",
      "9f1dc768751b4cefaa594a254f182e75",
      "9f40dda6e1cc4df9a5c7c4bcc4b9f439"
     ]
    },
    "id": "9a0d50e1",
    "outputId": "45d08fe5-8fd4-4cf1-a9e9-0138c68e199c"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "71b0f75cda12470fbe1d005de7cd9b68",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/153499 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Character counts, one entry per word over all texts\n",
    "symbols_count = []\n",
    "# Word counts, one entry per text\n",
    "words_count = []\n",
    "\n",
    "stopwords = nltk.corpus.stopwords.words('russian')\n",
    "pymorphy = pymorphy2.MorphAnalyzer(lang='ru')\n",
    "\n",
    "# File with the parameters saved by an earlier TextPrepare run\n",
    "model_param_file = 'text_prepare_model.json'\n",
    "# Reuse the saved lemma / POS-tag vocabulary so that pymorphy2 is not called\n",
    "# again for words that are already cached. On a fresh environment the file\n",
    "# does not exist yet, so start from an empty cache instead of failing.\n",
    "try:\n",
    "    model_vocab = TextPrepare.load_from_file(pymorphy=pymorphy, path=model_param_file)._vocab\n",
    "except FileNotFoundError:\n",
    "    model_vocab = {}\n",
    "\n",
    "# Fitted TextPrepare objects, one per text,\n",
    "# in the order the texts appear in the dataframe\n",
    "text_prep_models = []\n",
    "\n",
    "# Wrapping the iterable lets tqdm infer the total and advance by itself.\n",
    "tokenize_bar = tqdm(df_banki.text.values)\n",
    "for text in tokenize_bar:\n",
    "    text_prep_model = TextPrepare(pymorphy=pymorphy, stopwords=stopwords, vocab=model_vocab)\n",
    "    text_prep_model.fit(text)\n",
    "    text_prep_models.append(text_prep_model)\n",
    "    tokens = text_prep_model.get_tokens(lemmatize=False, stopwords=False)\n",
    "    symbols_count.extend(len(token) for token in tokens)\n",
    "    words_count.append(len(tokens))\n",
    "tokenize_bar.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "61ad5511",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 545
    },
    "id": "61ad5511",
    "outputId": "f3705423-c3c2-4aad-a211-241fd0ac215b"
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAWoAAAEICAYAAAB25L6yAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAWxklEQVR4nO3de7gkdX3n8fdHQBFBEZkol4HxEpMN7qrsPGpidFk1CkJE8xgW1ySiIvKoiWYNBjVR1kTFJGpiNCIqQoIL6HoJi1e8RV0XdEBULl7AHRYQYeQ+alTgu3/U70DT9JnTk3P6nB8z79fz9DPdVb+q+nZV9ad/VdV1JlWFJKlfd1vpAiRJm2ZQS1LnDGpJ6pxBLUmdM6glqXMGtSR1zqCWpM5t8UGdZH2SnybZmOSqJCcm2XGl65KkaW3xQd38dlXtCOwLrAX+bIXrkaSpbS1BDUBVXQF8AngYQJLnJrkoyU1Jvp/khaPtkxyc5LwkNya5JMn+bfgXkvxr66VvbD329SPTrU/yyiQXJrkuyfuSbD8y/qA23+uTfCXJfxhb7slJfj4y78tHxt0jyd8k+X/tCOG4JPccGb8mSY3UdkuSw9u4uyU5ur2Xa5J8IMkuY9NtO1bHMe35fmN1HNLaHz4y7HltfV6X5FNJ9p60HcaXleRFSS5Icr/2evckpye5NsnFSV4wNv1h7X3NvcdK8pB5ljXa9sYkn0uyRxt33yRnJNnQaj4jyZ6T5tPar07y4db+miRvH1nGl0favaLV9KT2+pj2+hkjbV40uv42VWcb/4K2Lq5t62b3sdpGjxx/nuTkaaZtNfy4TXdJkt/dxPvfnLb7JDmzLfOqJK+aZ5vcYfttatu39fiLNs31ST6SZKc27sFtnV2T5EdJ3p9k55Fx1ybZd2QZG5LsN1/9vdmqgjrJauCpwNfboKuBg4B7A88F3jqyMR8F/CNwFLAz8Hhg/cjsXlJVO7ae+m9PWNyzgacADwYeSuvFJ3kkcALwQuB+wLuA05PcY7RU4PVt3geMzffYNr9HAA8B9gBeMzJ+bpvep03/pZFxfwg8HfhPwO7AdcA7JtS+SUm2A/4CuHJk2MHAq4DfAVa15Z4yxbwOBf4EeEpVXdMGnwpc3mp8JvCGJE8YmexuwFdG1v9C/k9r90vAz4A/HpnP+4C9gb2AnwJvn6fObYAzgEuBNQzr/dQJ7XYB/gi4fmzUt4HDR14fBnxvmjrbe38jcAiwW6thfNl3Aw5q079hpJ5ppn14m+51wDsnvP3NatvC8zPAJxm24UOAz47VOt/2W2jbn9am2Qt4IPCcucW297k78O+A1cAxAFV1CfCnwMlJdmDY5idV1RcWeK/dmFlQJzkhydVJzp+y/SEZeqAXJPkfS1zOR5NcD3wZ+BfajlxVH6uqS2rwL8Cngce1aZ4PnFBVZ1bVrVV1RVV9ezOW+faquqyqrgVeDzyrDT8CeFdVnV1Vt1TVSQwfyseMTHtP4OfjM0ySNv0fV9W1VXVTey+HjjS7O3BrVd0yoaYjgVdX1eVV9TOGHfmZGelFT+mFwNnAd8fm/caquqiqbm51PSLz9Kqb/YH3AgdU1eXtPa4GHgv8aVX9a1WdB7wH+IOx93in9TOFu7XHNQBVdU1VfaiqftLW5esZvsQmeRRDCBxVVT9utX15QrtXMXwR3zA2/BzgAUn2bJ2Bq4AfTFMnw5f+CVV1btturwR+PcmakWnmWyfTTDtn25FlLmRTbQ8CflhVb27r6aaqOnuhWqfc9nO24Y7b8uL2Wf1ZVW0A3sLItqyqdwMXM+y3uwGvnvJ9dmGWPeoTGT6IC0ryyww70GOrah/gZUtcy9Oraueq2ruqXlRVP23LPSDJWe2w6HqG3vaubZrVwCWLWOZlI88vZfiQw9B7e3k7dLu+LXf1yHiABwAbJsxzFbADcM7ItJ9sw+fswtBTnmRv4CMj014E3ALc
f6TNj0bGHzI+g9ZbegXw5xPm/Xcj017L0MvZg/m9h+EoZTQcdwfmvoTmXDo2n4nvMcnjRg6nLxgZ9ZhW0/UMvbATW/sdkrwryaVJbgS+COzces/jVgOXti+hidqX0iHAX8/T5H0MR26HM7z3cRPrZFgnl841qqqNDAE1dwonDEd9k7b7Jqdtzk2ykeHo6nXzvb/NaLvQZ2e+fXSabX9IW0cbgB8D/wsgyf2TnJrkirYtT+b2z/KcdzOc9vz79qV1lzGzoK6qLzJ8WG/TzhV9Msk5Sb6U5FfbqBcA76iq69q0V8+qrpFa7gF8CPgb4P5VtTPwcYZwgSFoH7yIRaweeb4Xt/eeLmM4rbHzyGOHqjql1bUdw870jQnz/BHD4fk+I9POneKY81Du2NMddRlD73V02du3c/dzdp0bB3xgwjyOAj5QVZeODb8MeOHYvO9ZVV+ZpxYYjjL+C/D63H5u+AfALnPnHpu9gNEaJ77HqvrS3OF0+8Kfc1Z7P9szfIBPbMNfDvwK8OiqujfD6S24fR8Yf397LXD08RfAX40FzaiTgf8K/GfgYxPGz1fnDxi+CIfiknsxnDabWyd7M/Rwvz9hngtNC7Bv24ceCfxDkr3mf4tTtb0MeNAm5jHfPjrNtv9AW0c7AN8C3tyGvwEo4N+3bfl7jGzHDL/0+luGI7hj2imqu4zlPkd9PPCHVfUfGc5L/kMb/lDgoUn+d+vhTtUTX6S7A/dg+Ga+OckBwJNHxr8XeG6SJ2a4CLfHyBfLNF7cDnN3YTjMOq0NfzdwZJJHZ3CvJAeO7JzPBX4IrBufYVXd2qZ/a5JfAmh1PaU9Xw28FPjoPDUdxxCKe7f2q9q55Wnt1Op7/TzzfmWSfdq875NNXGxqvlRV5wNvY9g3qKrLgK8Ab0yyfYYLrc9nCC6SPJbhPPt873FTiuEIYu4IZCeGL77r23Z67Sam/SrDOflj2zbbvtUy5yHAoxmuOUxeeNX1DL3qN2+qZz6hzlMY9sVHtA7GG4Czq2p9229eC3y6qn4yYV7zTjuh7S3Adgy984Vsqu0ZwG5JXpbh4vdOSR4Nm95+C237MbcyrKfRbbkRuCHDRdijxtr/HbCuqg5n+JI8bor32I+qmtmD4aLL+e35jgwfivNGHhe1cWcAH2HY8A9k+EbeeYlqWA88aZ5xL2Y4V3g98E8MFzL+cmT8M4BvAjcxnN96Shv+BeDwkXZPAtaPLfOVwIVt3icBO4yM3x/4Wht3JfBBhh3t2Qw73y8YdrqNbZ3dChzXpt2e4cP2feBGhtMXf9TGXQi8FdhuZFm31crwxfzfgO+093QJ8IaRbVXAtiPTngwc057v18YfNWne7fXvM/Rybmzb8IRN7Be3Latt968Dz2mv92z7xLWtxiPb8F9r2+HwsfkV8JB5lnUYQ6hsbHWdA/xGG7d7ew8bGXp4LxxfB2Pz2oshYK5hOLp528gyCvjdSfsdw7WAkyfMb3TbzFtnG39kWxfXtnWzZxt+IvDPwO4jbe+wvPmmHVl3P27L/QHw55v4LG1O24cxXEC8jqHjcfQ022++bT/yvuY+GzcwXLD+1TZun7bONjJky8uBy9u4gxl65buMZNHFwLOXOvNm9UgrfCbaBYszquphSe4NfKeqdpvQ7jiGb/n3tdefBY6uqq/NrLgZyvBTvcOr6jObOd1hwJqqOmZs+J4MXyCHLVGJku5Clu3UR1XdCPzfucPhdtj/8Db6oww9NpLsynAqZNL5ti3djxl6U+NuZux8v6Stx+b+LGtqSU5hCN9dM9wo8VqGQ/t3JvkzhsPdUxkumn0KeHKSCxkO/46q239Tu9Woqg/OM/yHDKcsJG2FZnrqQ5K0eFvVnYmSdFc0k1Mfu+66a61Zs2YWs5akLdI555zzo6paNWncTIJ6zZo1rFt3p58BS5LmkWT8JrLbeOpDkjpnUEtS5wxqSeqcQS1JnTOoJalzBrUkdc6glqTOGdSS1DmDWpI6N7O/nrdS1hw9
6X84mt76Yw9cokokaWnYo5akzhnUktQ5g1qSOmdQS1LnDGpJ6pxBLUmd2+J+nre18+eJ0pbHHrUkdc6glqTOGdSS1DmDWpI6Z1BLUucMaknqnEEtSZ0zqCWpcwa1JHXOoJakzhnUktQ5g1qSOmdQS1Lnpg7qJNsk+XqSM2ZZkCTpjjbnz5y+FLgIuPeMatki+GdGJS21qXrUSfYEDgTeM9tyJEnjpj318bfAK4BbZ1iLJGmCBYM6yUHA1VV1zgLtjkiyLsm6DRs2LFmBkrS1m6ZH/VjgaUnWA6cCT0hy8nijqjq+qtZW1dpVq1YtcZmStPVaMKir6pVVtWdVrQEOBT5XVb8388okSYD/ue2dLPZXG5K01DYrqKvqC8AXZlKJJGki70yUpM4Z1JLUOYNakjrnxUTdgbfAS/2xRy1JnTOoJalzBrUkdc6glqTOGdSS1DmDWpI6Z1BLUucMaknqnEEtSZ0zqCWpcwa1JHXOoJakzhnUktQ5g1qSOmdQS1LnDGpJ6pxBLUmdM6glqXMGtSR1zqCWpM4Z1JLUOYNakjpnUEtS5wxqSeqcQS1JnTOoJalzBrUkdc6glqTOGdSS1LltV7oAbVnWHP2xRU2//tgDl6gSacthj1qSOmdQS1LnDGpJ6tyCQZ1k+yRfTfKNJBck+e/LUZgkaTDNxcSfAU+oqo1JtgO+nOQTVXXWjGuTJDFFUFdVARvby+3ao2ZZlCTpdlOdo06yTZLzgKuBM6vq7AltjkiyLsm6DRs2LHWdkrTVmiqoq+qWqnoEsCfwqCQPm9Dm+KpaW1VrV61atdR1StJWa7NueKmq65N8HtgfOH82JW3dFnvDiKQtzzS/+liVZOf2/J7AbwHfnnVhkqTBND3q3YCTkmzDEOwfqKozZluWJGnONL/6+CbwyGWoRZI0gXcmSlLnDGpJ6pxBLUmdM6glqXMGtSR1zqCWpM4Z1JLUOf/PRHXF/3NRujN71JLUOYNakjpnUEtS5wxqSeqcQS1JnTOoJalzBrUkdc6glqTOGdSS1DmDWpI6Z1BLUucMaknqnEEtSZ0zqCWpc/6ZU21R/DOp2hLZo5akzhnUktQ5g1qSOmdQS1LnDGpJ6pxBLUmdM6glqXMGtSR1zqCWpM4Z1JLUOYNakjpnUEtS5wxqSeqcQS1JnVswqJOsTvL5JBcmuSDJS5ejMEnSYJq/R30z8PKqOjfJTsA5Sc6sqgtnXJskiSl61FV1ZVWd257fBFwE7DHrwiRJg806R51kDfBI4OwJ445Isi7Jug0bNixNdZKk6YM6yY7Ah4CXVdWN4+Or6viqWltVa1etWrWUNUrSVm2qoE6yHUNIv7+qPjzbkiRJo6b51UeA9wIXVdVbZl+SJGnUND3qxwK/DzwhyXnt8dQZ1yVJahb8eV5VfRnIMtQiSZrAOxMlqXMGtSR1zqCWpM4Z1JLUOYNakjpnUEtS5wxqSeqcQS1JnTOoJalzBrUkdc6glqTOGdSS1DmDWpI6Z1BLUucMaknqnEEtSZ0zqCWpcwa1JHVuwf+KS9qarDn6Y4uafv2xBy5RJdLt7FFLUucMaknqnEEtSZ0zqCWpcwa1JHXOoJakzhnUktQ5g1qSOmdQS1LnDGpJ6pxBLUmdM6glqXMGtSR1zqCWpM4Z1JLUOYNakjpnUEtS5xYM6iQnJLk6yfnLUZAk6Y6m6VGfCOw/4zokSfNYMKir6ovAtctQiyRpgiU7R53kiCTrkqzbsGHDUs1WkrZ6SxbUVXV8Va2tqrWrVq1aqtlK0lbPX31IUucMaknq3LYLNUhyCrAfsGuSy4HXVtV7Z12YdFe05uiPLWr69cceuESVaEuyYFBX1bOWoxBJ0mSe+pCkzhnUktQ5g1qSOmdQS1LnDGpJ6pxBLUmdM6glqXMGtSR1zqCWpM4Z1JLUOYNakjpnUEtS5wxqSeqcQS1JnVvwz5xKWj7+PWtNYo9akjpnUEtS5wxqSeqcQS1JnTOoJalzBrUkdc6glqTO
GdSS1DlveJG2IN4ws2WyRy1JnTOoJalzBrUkdc6glqTOGdSS1DmDWpI6Z1BLUuf8HbWk2/g77D7Zo5akzhnUktQ5g1qSOuc5aklLxnPcs2GPWpI6N1VQJ9k/yXeSXJzk6FkXJUm63YKnPpJsA7wD+C3gcuBrSU6vqgtnXZykrYunTiab5hz1o4CLq+r7AElOBQ4GDGpJXVls0C/WrL4opgnqPYDLRl5fDjx6vFGSI4Aj2suNSb7zb6xpV+BH/8Zpl4P1LY71LY71Lc5M68ubFjX53vONWLJffVTV8cDxi51PknVVtXYJSpoJ61sc61sc61uc3uubzzQXE68AVo+83rMNkyQtg2mC+mvALyd5YJK7A4cCp8+2LEnSnAVPfVTVzUleAnwK2AY4oaoumGFNiz59MmPWtzjWtzjWtzi91zdRqmqla5AkbYJ3JkpS5wxqSercigX1QrelJ7lHktPa+LOTrFnG2lYn+XySC5NckOSlE9rsl+SGJOe1x2uWq762/PVJvtWWvW7C+CR5W1t/30yy7zLW9isj6+W8JDcmedlYm2Vdf0lOSHJ1kvNHhu2S5Mwk32v/3neeaZ/T2nwvyXOWsb6/TvLttv0+kmTneabd5L4ww/qOSXLFyDZ86jzTzvxPUMxT32kjta1Pct480858/S1aVS37g+Gi5CXAg4C7A98Afm2szYuA49rzQ4HTlrG+3YB92/OdgO9OqG8/4IyVWH9t+euBXTcx/qnAJ4AAjwHOXsFt/UNg75Vcf8DjgX2B80eG/RVwdHt+NPCmCdPtAny//Xvf9vy+y1Tfk4Ft2/M3Tapvmn1hhvUdA/zJFNt/k5/1WdU3Nv7NwGtWav0t9rFSPerbbkuvqp8Dc7eljzoYOKk9/5/AE5NkOYqrqiur6tz2/CbgIoY7NO9KDgb+sQZnATsn2W0F6ngicElVXboCy75NVX0RuHZs8Og+dhLw9AmTPgU4s6qurarrgDOB/Zejvqr6dFXd3F6exXAPw4qYZ/1NY5rP+qJtqr6WG4cApyz1cpfLSgX1pNvSx4PwtjZtZ70BuN+yVDeinXJ5JHD2hNG/nuQbST6RZJ9lLQwK+HSSc9rt++OmWcfL4VDm/4Cs5PoDuH9VXdme/xC4/4Q2vazH5zEcIU2y0L4wSy9pp2ZOmOfUUQ/r73HAVVX1vXnGr+T6m4oXEzchyY7Ah4CXVdWNY6PPZTicfzjw98BHl7m836yqfYEDgBcnefwyL39B7QappwEfnDB6pdffHdRwDNzlb1WTvBq4GXj/PE1Wal94J/Bg4BHAlQynF3r0LDbdm+7+s7RSQT3Nbem3tUmyLXAf4JplqW5Y5nYMIf3+qvrw+PiqurGqNrbnHwe2S7LrctVXVVe0f68GPsJwiDmqh1v/DwDOraqrxkes9Pprrpo7HdT+vXpCmxVdj0kOAw4Cnt2+TO5kin1hJqrqqqq6papuBd49z3JXev1tC/wOcNp8bVZq/W2OlQrqaW5LPx2Yu8L+TOBz8+2oS62d03ovcFFVvWWeNg+YO2ee5FEM63JZvkiS3CvJTnPPGS46nT/W7HTgD9qvPx4D3DBymL9c5u3JrOT6GzG6jz0H+OcJbT4FPDnJfduh/ZPbsJlLsj/wCuBpVfWTedpMsy/Mqr7Rax7PmGe5K/0nKJ4EfLuqLp80ciXX32ZZqauYDL9K+C7DFeFXt2GvY9gpAbZnOGS+GPgq8KBlrO03GQ6Dvwmc1x5PBY4EjmxtXgJcwHAV+yzgN5axvge15X6j1TC3/kbrC8N/+HAJ8C1g7TJv33sxBO99Roat2Ppj+MK4EvgFw3nS5zNc8/gs8D3gM8Aure1a4D0j0z6v7YcXA89dxvouZji/O7cPzv0Kanfg45vaF5apvn9q+9Y3GcJ3t/H62us7fdaXo742/MS5fW6k7bKvv8U+vIVckjrnxURJ6pxBLUmdM6glqXMGtSR1zqCWpM4Z1JLUOYNakjr3/wE3lqVIV9Hv1gAAAABJ
RU5ErkJggg==\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAEICAYAAACktLTqAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAbU0lEQVR4nO3df7QcZZ3n8ffHBAi/JASy2ZBELg4ZWZwdIOceCAd1WYLKD8ewu4iw7hAwTJwj4zCjqwZ1D8wZGcNZ1wjrDhIBCeogCCoZwoIY0JF1CXOzAgKRyYUJ5oYfuUASfvkL+O4f9TRTNN23u+/tvvf205/XOX1u1VNPVT/V1f2pqqeq+yoiMDOzvLxpohtgZmbt53A3M8uQw93MLEMOdzOzDDnczcwy5HA3M8uQw93MLEMO9zaQtFnSryS9IOkpSVdL2mui22Vmvcvh3j5/FBF7AQuAfuBzE9weM+thDvc2i4itwP8G/gBA0tmSNkp6XtKjkj5Sri9psaR7JT0n6RFJJ6TyH0n6dTobeCGdGWwuzbdZ0vmSHpK0XdLXJU0rTX9fWu4OST+V9IdVz/tNSb8tLXuoNG03SV+U9Mt0JvJVSbuXpvdJilLbXpF0Tpr2JknL07o8I+l6STOq5pta1Y4L0/CxVe04LdU/p1T24fR6bpd0m6QDa22H6ueS9FFJD0raL40fIGmNpGclDUr6k6r5z0rrVVnHkHRwnecq131O0h2S5qRp+0q6WdJwavPNkubWWk6qP0/Sd1P9ZyR9pTTtWEmvltr0qqTj07R9JF2T5ntM0uckvalR+1pZl6p6B5Ta8VtJvyuNvzPVqfseTO/fStuPlPSEpCPT+BRJn0nvoeclbUivy9+n5b9Y9f77appveWmehyT9h9LzXSbpxtL4xZLWSVK9bdH1IsKPMT6AzcDxaXge8CDw12n8ZOD3AAH/DngJWJCmHQnsBN5NsaOdAxySpv0IOKf0HMcDm6ue84H0fDOA/wN8Pk07AtgGHAVMAZak+ruV5v8WcEEaPhYYKk1bCaxJy90b+HvgC6XpbwUCmFLdVuA84G5gLrAbcDlwbZrWl+abWlrWN4ELq9sB7AI8DDxeWvZiYBD4N8BUirOjn9bZJq89F3A68CgwtzT9H4C/BaYBhwPDwHGl6R8GflIaD+DgOs91FnBXGp5GsXP/YhrfD/hPwB7ptfwO8P06y5kC3Jde/z3Tst5Rmn4csKXO++4a4Kb0HH3APwFLG7WvlXUZ4f1/IfDNqrIR34OVtqdtuRVYVJr3k8DPgbdRfG4OA/artW2rnvMDwAEUn6UPAi8Cs9O0PdJrchbwTuDp8vshx4eP3Nvn+5J2AHcBPwb+BiAi1kbEI1H4MfADijcXwFLgqoi4PSJejYitEfGLFp7zKxGxJSKeBS4Czkjly4DLI2J9RLwSEauB3wALS/PuDvy2eoHpSGYZ8JcR8WxEPJ/W5fRStV2BVyPilRpt+lPgsxExFBG/ofjgn1o+Wm/SR4D1FB/I8rK/EBEbI+Ll1K7D6x29JycAVwInRsRQWsd5wDHApyPi1xFxL3AFcGbVOr7h9WnCm9LjGYCIeCYiboyIl9JreRHFTr6WIynC6ZMR8WJq212N2iRpCsX2OT8ino+IzcD/AP64UftaWZcWNfMePJDi8/C5iFhXKj8nlT2cPjf3RUTDNkTEdyLi8fRZug7YRPGaEhEvUbweX6I4oPhY5f2QK4d7+5wSEdMj4sCI+GhE/ApA0omS7k6n/zuAk4D90zzzgEfG8JxbSsOPUQQDFB+aT6TT4R3peeeVpgP8a4qj1WozKY5yNpTmvTWVV8wAttdp04HA90rzbgReAWaV6jxdmn5a9QIk7Q18CvhvNZZ9SWneZymO7Gp2MSRXUBwllgP1AKCy46p4rGo5NddR0jtL3QEPliYtTG3aARwEXJ3q7yHp8tRV8hzFGcP0FMjV5gGPpR1X
LfVe9/0pznQeG2F9aravjlbq1tPMe/B/AkMUZ65lo/pcSDqz1A20g6JrtPJZIyLWU5zBCbi+1eV3G4d7B0naDbgR+CIwKyKmA7dQvLmgCOffG8NTzCsNv4WiC6Oy3IvSzqby2CMirk3t2oXijX9fjWU+DfwKeHtp3n2iuFhc8fu8/oi6bAvFUXL5uadFcS2iYv/KNGp/yD4JXB8Rj1WVbwE+UrXs3SPip3XaAsXZzAeBi0p93Y8DM9JOpOItFN0DI65jRPwkIvZKj7eXJt2d1mcaxZHh1an8ExTdC0dFxJuBd6XyWn29W4C3jHCWU+91fxr4HUWg1lufeu2rpZW69Yz4Hkz+O7AIOFLS+6vmbelzkc7evgb8GUUXznSKbkuV6pxL0VX4OMXBQ9Yc7p21K8WbaRh4WdKJwHtK068Ezpa0SMWFyDmSDmlh+edKmqviguVngetS+deAP5V0lAp7Sjq5FGZnA08CA9ULjIhX0/wrJf0rgNSu96bheRT96t+v06avUgTpgan+TEmLW1invVP7Lqqz7PMlvT0tex9JH2iwvJ9ExAPApcCqtI5bgJ8CX5A0LV3oW0oRZEg6BjhlhHUcSVCcqVTOdPam2FnuSNvpghHmvQd4AliRttm01BYkHUpxHeANbUrdY9dTvO57p9f+45X1adC+VtalFY3eg1Bsm5coXvu/lTQ9lV8B/LWk+WneP1S6ED6CPVN7h6G4kYF0U0Ma/33g88B/oeie+ZSkw0exXl3D4d5B6bT/zyk+eNuB/0xxobIy/R6KIFtJcWH1x7z+6KuRv6Pos3yU4jT282m5A8CfAF9JzztIcSEJSR+iuMh5EPC8pBcoLpodULnrAPh0mufu1JXwQ4qjT4DbKC6grqzTpkvSOv5A0vMUF1ePamGd3gxcGhFv6H6IiO8BFwPfTu16ADixyeWuAGZLWpLGz6C4MPc48D2Ki8s/TCG6Gvivafs06+j0Wu4E/iPFESTAlymubzxN8VrcWm8BKaT/CDgY+CVFl8UHJe1JsZ0vj4h63Qkfo7iA+CjFdZ+/A65qon2trEvTRnoP1qj7Y4qdVuU99SWKz8wPgOcoDoJ2rzVvaRkPUVxn+L/AU8C/pbjJgHQm9E3g4tR/vwn4DPCNdHadJUX4n3V0IxW3RZ4TET9scb6zgL6IuLCqfC7F3TZntamJZjaBfOTee16kOBqq9jLFBUozy4CP3LvUaI/czaw3ONzNzDLkbhkzswy1+q3Bjth///2jr69vopthZtZVNmzY8HRE1LxVdVKEe19fHwMDb7jl2szMRiCp+ot+r3G3jJlZhhzuZmYZcribmWXI4W5mliGHu5lZhhzuZmYZcribmWXI4W5mliGHu5lZhhzuo9S3fO1EN8HMrC6Hu5lZhhzuZmYZcribmWXI4W5mliGHu5lZhhzuZmYZcribmWXI4d4i399uZt3A4W5mliGHewM+UjezbuRwNzPLkMO9g3zUb2YTxeHeBIe0mXUbh7uZWYYc7m1Q68jeR/tmNpGaCndJ0yXdIOkXkjZKOlrSDEm3S9qU/u6b6krSpZIGJd0vaUFnV8HMzKo1e+R+CXBrRBwCHAZsBJYD6yJiPrAujQOcCMxPj2XAZW1tsZmZNdQw3CXtA7wLuBIgIn4bETuAxcDqVG01cEoaXgxcE4W7gemSZre95ePM3Sxm1k2aOXI/CBgGvi7pZ5KukLQnMCsinkh1ngRmpeE5wJbS/EOp7HUkLZM0IGlgeHh49GswiXgHYGaTRTPhPhVYAFwWEUcAL/IvXTAAREQA0coTR8SqiOiPiP6ZM2e2MquZmTXQTLgPAUMRsT6N30AR9k9VulvS321p+lZgXmn+uaksSz5aN7PJqGG4R8STwBZJb0tFi4CHgDXAklS2BLgpDa8Bzkx3zSwEdpa6b8zMbBxMbbLex4BvSdoVeBQ4m2LHcL2kpcBjwGmp7i3AScAg8FKqmwUfpZtZt2gq3CPiXqC/xqRFNeoGcO4Y29UV
6oV9dXnf8rVsXnHyeDTJzAzwN1TNzLLkcG+TytG6u27MbDJwuJuZZcjhbmaWIYe7mVmGHO5mZhlyuI+CL5qa2WTncB9Bu0PcOwUzGy8O9zraFcQOdDObCA73CeDAN7NOc7ibmWXI4T6J+IjezNrF4W5mliGHew2dOIL2UbmZjSeHu5lZhhzuZmYZcribmWXI4W5mliGHO77YaWb5cbibmWXI4W5mliGHu5lZhpoKd0mbJf1c0r2SBlLZDEm3S9qU/u6byiXpUkmDku6XtKCTK9BuE93/PtHPb2Z5aOXI/d9HxOER0Z/GlwPrImI+sC6NA5wIzE+PZcBl7WpsThziZtZJY+mWWQysTsOrgVNK5ddE4W5guqTZY3iejnLImlmOmg33AH4gaYOkZalsVkQ8kYafBGal4TnAltK8Q6ls0pvIoPdOxszaqdlwf0dELKDocjlX0rvKEyMiKHYATZO0TNKApIHh4eFWZs2GA93MOqWpcI+IrenvNuB7wJHAU5XulvR3W6q+FZhXmn1uKqte5qqI6I+I/pkzZ45+DdrEQWtmOWkY7pL2lLR3ZRh4D/AAsAZYkqotAW5Kw2uAM9NdMwuBnaXuGzMzGwfNHLnPAu6SdB9wD7A2Im4FVgDvlrQJOD6NA9wCPAoMAl8DPtr2Vmek1hmDzyLMbKymNqoQEY8Ch9UofwZYVKM8gHPb0roOc4iaWa78DdVx5h2KmY0Hh/sk5Z2AmY2Fw93MLEMOdzOzDDncJwF3wZhZuznczcwy5HA3M8uQw93MLEMOdzOzDDncu4QvuppZKxzuZmYZcrhPYn3L177uiN1H72bWLIe7mVmGHO5mZhlyuHchd8+YWSMO9y7gMDezVjnczcwy5HA3M8uQw93MLEMOdzOzDDnczcwy5HDvMr5zxsya0bPh7pA0s5w1He6Spkj6maSb0/hBktZLGpR0naRdU/luaXwwTe/rTNPNzKyeVo7czwM2lsYvBlZGxMHAdmBpKl8KbE/lK1M96xCfgZhZLU2Fu6S5wMnAFWlcwHHADanKauCUNLw4jZOmL0r1zcxsnDR75P5l4FPAq2l8P2BHRLycxoeAOWl4DrAFIE3fmeq/jqRlkgYkDQwPD4+y+WZmVsvURhUkvQ/YFhEbJB3brieOiFXAKoD+/v5o13J7hbtjzGwkDcMdOAZ4v6STgGnAm4FLgOmSpqaj87nA1lR/KzAPGJI0FdgHeKbtLTczs7oadstExPkRMTci+oDTgTsi4kPAncCpqdoS4KY0vCaNk6bfERE+MjczG0djuc/908DHJQ1S9KlfmcqvBPZL5R8Hlo+tiWZm1qpmumVeExE/An6Uhh8FjqxR59fAB9rQNmtB3/K1bF5x8kQ3w8wmiZ79hqqZWc4c7mZmGXK4Z8C3RZpZNYd7RhzyZlbhcM9cJfAd/Ga9xeFuZpYhh3uG+pav9ZG6WY9zuGfGoW5m4HA3M8uSw93MLEMOdzOzDDnczcwy5HA3M8uQw93MLEMOdzOzDDnczcwy5HA3M8uQwz1j/tEws97lcDczy5DD3cwsQw73HlPuonF3jVm+HO5mZhlqGO6Spkm6R9J9kh6U9Fep/CBJ6yUNSrpO0q6pfLc0Ppim93V2FczMrFozR+6/AY6LiMOAw4ETJC0ELgZWRsTBwHZgaaq/FNieylememZmNo4ahnsUXkiju6RHAMcBN6Ty1cApaXhxGidNXyRJbWuxtZX73c3y1FSfu6Qpku4FtgG3A48AOyLi5VRlCJiThucAWwDS9J3AfjWWuUzSgKSB4eHhsa2FNcVBbtY7mgr3iHglIg4H5gJHAoeM9YkjYlVE9EdE/8yZM8e6ODMzK2npbpmI2AHcCRwNTJc0NU2aC2xNw1uBeQBp+j7AM21prZmZNaWZu2VmSpqehncH3g1spAj5U1O1JcBNaXhNGidNvyMiop2NNjOzkU1tXIXZwGpJUyh2BtdHxM2SHgK+LenzwM+AK1P9K4FvSBoEngVO70C7
bYzc/26Wt4bhHhH3A0fUKH+Uov+9uvzXwAfa0jobF33L17J5xckT3QwzayN/Q9XMLEM9Ge7ukqjPr41ZHnoy3M3Mcudwt9f4qN0sHw53q8lBb9bdHO4GOMzNcuNwt7oc+Gbdy+HegxzaZvlzuJuZZcjhbmaWIYe7mVmGHO5mZhlyuJuZZcjhbmaWIYe7vYFvlTTrfg53a6hv+VoHvlmXcbhb0xzwZt3D4W4tccCbdQeHu5lZhhzuZmYZcrhby9w1Yzb5OdzNzDLUMNwlzZN0p6SHJD0o6bxUPkPS7ZI2pb/7pnJJulTSoKT7JS3o9EqYmdnrNXPk/jLwiYg4FFgInCvpUGA5sC4i5gPr0jjAicD89FgGXNb2Vtu4qdcF464Zs8mtYbhHxBMR8f/S8PPARmAOsBhYnaqtBk5Jw4uBa6JwNzBd0uy2t9zMzOpqqc9dUh9wBLAemBURT6RJTwKz0vAcYEtptqFUVr2sZZIGJA0MDw+32GwzMxtJ0+EuaS/gRuAvIuK58rSICCBaeeKIWBUR/RHRP3PmzFZmNTOzBpoKd0m7UAT7tyLiu6n4qUp3S/q7LZVvBeaVZp+byiwz7nc3m7yauVtGwJXAxoj4UmnSGmBJGl4C3FQqPzPdNbMQ2FnqvjEzs3EwtYk6xwB/DPxc0r2p7DPACuB6SUuBx4DT0rRbgJOAQeAl4Oy2ttjMzBpqGO4RcRegOpMX1agfwLljbJeZmY2Bv6FqZpYhh7uNiS+qmk1ODncbs3LAV4b935vMJlbPhbsDp/OqX2O/5mbjr+fC3cysFzRzK6RZQ/6BMbPJxUfuZmYZcrjbhPARvVlnOdxt3DjQzcaPw93MLEMOdxsXzRy1+8jerH0c7jbhHOpm7edwt3FX/harmXWGw93MLEMOdxtXPlo3Gx8Od5tQ/marWWc43G3ScbCbjZ3D3SYtX3g1Gz2Hu01qDnaz0XG426Tgo3Sz9nK4m5llyOFuZpahhuEu6SpJ2yQ9UCqbIel2SZvS331TuSRdKmlQ0v2SFnSy8dZ73G1j1pxmjtyvBk6oKlsOrIuI+cC6NA5wIjA/PZYBl7WnmdbrHOpmrWkY7hHxD8CzVcWLgdVpeDVwSqn8mijcDUyXNLtdjbXe1mzAe0dgNvo+91kR8UQafhKYlYbnAFtK9YZSmZmZjaMxX1CNiACi1fkkLZM0IGlgeHh4rM2wHlLrtkkfrZu93mjD/alKd0v6uy2VbwXmlerNTWVvEBGrIqI/Ivpnzpw5ymaYjRzsDn3rVaMN9zXAkjS8BLipVH5mumtmIbCz1H1jZmbjZGqjCpKuBY4F9pc0BFwArACul7QUeAw4LVW/BTgJGAReAs7uQJvNzKyBhuEeEWfUmbSoRt0Azh1ro8ya5W4Xs9r8DVXrSo1+B756eqNxs9z0VLj7A5230dwH7/eE5aqnwt16S/VRfKMgd9BbThzu1rP8M8OWM4e7Ge6qsfw43M3MMtQz4e6jMRsrv4esm/RMuJvB2C+q+pcprVv0RLj7g2ataub3atxPb5NZT4S72WiM5gfJ+pavddDbpOBwNxtBK900DnWbTBr+toxZr3NoWzfykbvZGLUa/v6dGxsPDnezcTDWHYBZq7IPd39IbKKMx101/i9UVk/24W42keoFe/VdNeXxZkLZwW2NZBvufvNbN2j2fvpWbrGsV9dH+b0l23AHv2Gt+3TqZ4n9Weg9WYe7WbfrVB99ozMB7wy6n8PdrMuMdKG2Hd0xta4FtHpNwCZeVuHuN5/1olaDfaT77DvVr9/svf21ltvsMhuV95qswt3MRm80X65q1M1TfcQ/1rMIa57D3czGpNZOoZm7e0YK7tFeWG52ZzDeO4qJ2DF1JNwlnSDpYUmDkpZ34jnq8d7drHPa8fka6X7/Rs81UrdNMzuLWt8vqFc+2u6hyfKb/4qI9i5QmgL8E/BuYAj4R+CMiHio3jz9/f0xMDAw5ud2sJtZPZtXnNxURjRTr1ad6rLKePlv
WaVudXkrJG2IiP6a0zoQ7kcDF0bEe9P4+QAR8YV687Qj3B3sZtaNOhXunfjJ3znAltL4EHBUjUYtA5al0RckPTzK59sfeHqU83Yrr3Nv8Dr3AF08pnU+sN6ECfs994hYBawa63IkDdTbc+XK69wbvM69oVPr3IkLqluBeaXxuanMzMzGSSfC/R+B+ZIOkrQrcDqwpgPPY2ZmdbS9WyYiXpb0Z8BtwBTgqoh4sN3PUzLmrp0u5HXuDV7n3tCRdW773TJmZjbx/A1VM7MMOdzNzDLU1eE+kT9z0EmS5km6U9JDkh6UdF4qnyHpdkmb0t99U7kkXZpeh/slLZjYNRgdSVMk/UzSzWn8IEnr03pdly7QI2m3ND6YpvdNZLtHS9J0STdI+oWkjZKO7oFt/JfpPf2ApGslTctxO0u6StI2SQ+UylretpKWpPqbJC1ppQ1dG+7pZw7+F3AicChwhqRDJ7ZVbfMy8ImIOBRYCJyb1m05sC4i5gPr0jgUr8H89FgGXDb+TW6L84CNpfGLgZURcTCwHViaypcC21P5ylSvG10C3BoRhwCHUax7tttY0hzgz4H+iPgDihsuTifP7Xw1cEJVWUvbVtIM4AKKL4EeCVxQ2SE0JSK68gEcDdxWGj8fOH+i29Whdb2J4rd6HgZmp7LZwMNp+HKK3++p1H+tXrc8KL4PsQ44DrgZEMW39qZWb2+KO7GOTsNTUz1N9Dq0uL77AP9c3e7Mt3Hl2+sz0na7GXhvrtsZ6AMeGO22Bc4ALi+Vv65eo0fXHrlT+2cO5kxQWzomnYoeAawHZkXEE2nSk8CsNJzDa/Fl4FPAq2l8P2BHRLycxsvr9Nr6puk7U/1uchAwDHw9dUVdIWlPMt7GEbEV+CLwS+AJiu22gby3c1mr23ZM27ybwz17kvYCbgT+IiKeK0+LYleexX2skt4HbIuIDRPdlnE0FVgAXBYRRwAv8i+n6UBe2xggdSksptixHQDsyRu7LnrCeGzbbg73rH/mQNIuFMH+rYj4bip+StLsNH02sC2Vd/trcQzwfkmbgW9TdM1cAkyXVPmiXXmdXlvfNH0f4JnxbHAbDAFDEbE+jd9AEfa5bmOA44F/jojhiPgd8F2KbZ/zdi5rdduOaZt3c7hn+zMHkgRcCWyMiC+VJq0BKlfMl1D0xVfKz0xX3RcCO0unf5NeRJwfEXMjoo9iO94RER8C7gROTdWq17fyOpya6nfVEW5EPAlskfS2VLQIeIhMt3HyS2ChpD3Se7yyztlu5yqtbtvbgPdI2jed9bwnlTVnoi86jPGCxUkU/xjkEeCzE92eNq7XOyhO2e4H7k2Pkyj6G9cBm4AfAjNSfVHcOfQI8HOKuxEmfD1Gue7HAjen4bcC9wCDwHeA3VL5tDQ+mKa/daLbPcp1PRwYSNv5+8C+uW9j4K+AXwAPAN8AdstxOwPXUlxX+B3FWdrS0Wxb4MNp/QeBs1tpg39+wMwsQ93cLWNmZnU43M3MMuRwNzPLkMPdzCxDDnczsww53M3MMuRwNzPL0P8HfC74VqdMAb0AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# One histogram per statistic; each triple is (data, bin edges, figure title).\n",
    "for hist_values, hist_bins, hist_title in (\n",
    "    (symbols_count, np.arange(0, 20), \"Распределение кол-ва символов в словах\"),\n",
    "    (words_count, np.arange(0, 1000), \"Распределение кол-ва слов в текстах\"),\n",
    "):\n",
    "    plt.hist(hist_values, bins=hist_bins)\n",
    "    plt.title(hist_title)\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81ya1kgaABHR",
   "metadata": {
    "id": "81ya1kgaABHR"
   },
   "source": [
    "# 1.3 Поиск самых частых значений"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aSSevtDNAOVo",
   "metadata": {
    "id": "aSSevtDNAOVo"
   },
   "outputs": [],
   "source": [
    "def get_object_counts(text_prep_models, top_n=None, stopwords=False,\n",
    "                      lemmatize=False, print_result=False, pos_tag=None):\n",
    "    \"\"\"Count how often each token occurs across all given text models.\n",
    "\n",
    "    Args:\n",
    "        text_prep_models: iterable of objects that provide\n",
    "            get_tokens(stopwords=..., lemmatize=...) and get_pos_tags(stopwords=...).\n",
    "        top_n: keep only the first top_n entries of the ranking;\n",
    "            a falsy value (None, 0) keeps the whole ranking.\n",
    "        stopwords: forwarded as-is to get_tokens / get_pos_tags.\n",
    "        lemmatize: forwarded as-is to get_tokens; not used when pos_tag is set.\n",
    "        print_result: when true, print the ranking with positions starting at 1.\n",
    "        pos_tag: when truthy, count only the tokens returned by get_pos_tags\n",
    "            whose tag equals str(pos_tag).upper().\n",
    "\n",
    "    Returns:\n",
    "        List of (token, count) tuples ordered by descending count.\n",
    "    \"\"\"\n",
    "    # Normalise the requested tag once instead of on every iteration.\n",
    "    wanted_pos = str(pos_tag).upper() if pos_tag else None\n",
    "\n",
    "    frequencies = Counter()\n",
    "    for model in text_prep_models:\n",
    "        if wanted_pos is not None:\n",
    "            tagged = model.get_pos_tags(stopwords=stopwords)\n",
    "            frequencies.update([token for token, pos in tagged if pos == wanted_pos])\n",
    "        else:\n",
    "            frequencies.update(model.get_tokens(stopwords=stopwords, lemmatize=lemmatize))\n",
    "\n",
    "    # most_common() without an argument returns every entry, most frequent first.\n",
    "    ranking = frequencies.most_common()\n",
    "    if top_n and len(ranking) > top_n:\n",
    "        ranking = ranking[:top_n]\n",
    "\n",
    "    if print_result:\n",
    "        for position, (word, count) in enumerate(ranking, start=1):\n",
    "            print(f'{position}: {word} (частота {count})')\n",
    "\n",
    "    return ranking"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "zYb4VR_cFJLf",
   "metadata": {
    "id": "zYb4VR_cFJLf"
   },
   "source": [
    "## 1.3.1 Десять самых частых слов"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bTJusqU4Dahw",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bTJusqU4Dahw",
    "outputId": "62971088-d014-4fb6-d3de-b9fc7ebb7290"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1: в (частота 1330535)\n",
      "2: и (частота 1058982)\n",
      "3: не (частота 855332)\n",
      "4: на (частота 707931)\n",
      "5: что (частота 640057)\n",
      "6: я (частота 577484)\n",
      "7: с (частота 463754)\n",
      "8: по (частота 372765)\n",
      "9: мне (частота 320958)\n",
      "10: банка (частота 275899)\n"
     ]
    }
   ],
   "source": [
    "# Top 10 tokens with the default options (stopwords=False, lemmatize=False);\n",
    "# the result is bound to `_` so that only the printed ranking appears in the output.\n",
    "_ = get_object_counts(text_prep_models=text_prep_models, \n",
    "                      top_n=10, print_result=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "QKVHlzNaFOkx",
   "metadata": {
    "id": "QKVHlzNaFOkx"
   },
   "source": [
    "## 1.3.2 Десять самых частых слов без стоп-слов\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "Vou6G2C2FRqu",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Vou6G2C2FRqu",
    "outputId": "f933aec9-ac2a-4bc9-a663-399125b9c70d"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1: банка (частота 275899)\n",
      "2: банк (частота 215411)\n",
      "3: это (частота 183365)\n",
      "4: деньги (частота 131200)\n",
      "5: карту (частота 105827)\n",
      "6: карты (частота 101821)\n",
      "7: т (частота 91715)\n",
      "8: кредит (частота 81119)\n",
      "9: день (частота 76552)\n",
      "10: банке (частота 72162)\n"
     ]
    }
   ],
   "source": [
    "# Same ranking, but with stopwords=True forwarded to get_tokens\n",
    "# (per the section heading this is the variant without stop words).\n",
    "_ = get_object_counts(text_prep_models=text_prep_models, \n",
    "                      stopwords=True, top_n=10, print_result=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "R9wWW-a3Gvlt",
   "metadata": {
    "id": "R9wWW-a3Gvlt"
   },
   "source": [
    "## 1.3.3 Десять самых частых лемм"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "wTom0ZGgG1zN",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "wTom0ZGgG1zN",
    "outputId": "ed63c73a-0c41-48c5-ad33-326f60fb87c8"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1: банк (частота 668522)\n",
      "2: карта (частота 334994)\n",
      "3: это (частота 200057)\n",
      "4: деньга (частота 179818)\n",
      "5: день (частота 173116)\n",
      "6: мой (частота 163042)\n",
      "7: кредит (частота 156153)\n",
      "8: который (частота 149016)\n",
      "9: отделение (частота 144049)\n",
      "10: клиент (частота 136889)\n"
     ]
    }
   ],
   "source": [
    "# Top 10 lemmas: stopwords=True and lemmatize=True are both forwarded to get_tokens.\n",
    "_ = get_object_counts(text_prep_models=text_prep_models, \n",
    "                      stopwords=True, lemmatize=True, top_n=10, \n",
    "                      print_result=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "FnIPbgxaJvPK",
   "metadata": {
    "id": "FnIPbgxaJvPK"
   },
   "source": [
    "## 1.3.4 Десять самых частых существительных"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "joCYIG1BFK_G",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "joCYIG1BFK_G",
    "outputId": "028337dc-aec4-4640-e1bd-d465a1041ea8"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1: банк (частота 668522)\n",
      "2: карта (частота 334994)\n",
      "3: деньга (частота 179818)\n",
      "4: день (частота 173116)\n",
      "5: кредит (частота 156153)\n",
      "6: отделение (частота 144049)\n",
      "7: клиент (частота 136889)\n",
      "8: сотрудник (частота 134168)\n",
      "9: счёт (частота 126894)\n",
      "10: сумма (частота 111232)\n"
     ]
    }
   ],
   "source": [
    "# Top 10 tokens tagged 'NOUN'.\n",
    "# NOTE: when pos_tag is set, get_object_counts takes its tokens from\n",
    "# get_pos_tags(stopwords=...) and does not use the lemmatize argument.\n",
    "_ = get_object_counts(text_prep_models=text_prep_models, \n",
    "                      stopwords=True, lemmatize=True, pos_tag='NOUN', top_n=10, \n",
    "                      print_result=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "NG5H6vXKJj2z",
   "metadata": {
    "id": "NG5H6vXKJj2z"
   },
   "source": [
    "## 1.4 Построение кривых Ципфа и Хипса\n",
    "### 1.4.1 Кривая Ципфа\n",
    "\n",
    "Зако́н Ци́пфа — эмпирическая закономерность распределения частоты слов естественного языка: если все слова языка (или просто достаточно длинного текста) упорядочить по убыванию частоты их использования, то частота n-го слова в таком списке окажется приблизительно обратно пропорциональной его порядковому номеру n (так называемому рангу этого слова). Например, второе по используемости слово встречается примерно в два раза реже, чем первое, третье — в три раза реже, чем первое, и так далее. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "CZ02DXFjJiG0",
   "metadata": {
    "id": "CZ02DXFjJiG0"
   },
   "outputs": [],
   "source": [
    "def plot_zakon_cipfa(text_prep_models, size=200, a_param=0.07):\n",
    "    \"\"\"Plot the empirical Zipf curve of the corpus next to the reference a/rank curve.\n",
    "\n",
    "    Tokens are counted with get_object_counts(stopwords=True, lemmatize=True).\n",
    "    The relative frequency of each of the `size` most frequent entries is drawn\n",
    "    against its rank, together with the curve a_param / rank, so the two can be\n",
    "    compared. The ranked words are also printed, one per line.\n",
    "\n",
    "    Args:\n",
    "        text_prep_models: iterable of text models accepted by get_object_counts.\n",
    "        size: number of top-ranked entries to show; a falsy value shows all of them.\n",
    "        a_param: constant of the reference curve f(rank) = a_param / rank.\n",
    "\n",
    "    Returns:\n",
    "        None. The figure is shown with plt.show().\n",
    "    \"\"\"\n",
    "    # Count the full vocabulary first: the total is needed to turn counts into\n",
    "    # relative frequencies that are comparable with a_param / rank.\n",
    "    stat_tokens = get_object_counts(text_prep_models, stopwords=True, lemmatize=True)\n",
    "    total_count = sum(count for _, count in stat_tokens)\n",
    "    if size and len(stat_tokens) > size:\n",
    "        stat_tokens = stat_tokens[:size]\n",
    "\n",
    "    # Ranks start at 1 both in the printout and on the x axis.\n",
    "    for rang, (word, _) in enumerate(stat_tokens, 1):\n",
    "        print(f\"Ранг {rang}: {word}\")\n",
    "\n",
    "    ranks = np.arange(1, len(stat_tokens) + 1)\n",
    "    empirical = [count / total_count for _, count in stat_tokens]\n",
    "    theoretical = a_param / ranks\n",
    "\n",
    "    plt.plot(ranks, empirical, label=\"Эмпирическая частота\")\n",
    "    plt.plot(ranks, theoretical, linestyle=\"--\", label=f\"{a_param} / ранг\")\n",
    "    # Label roughly 20 ranks at most so the axis stays readable for large `size`.\n",
    "    plt.xticks(ranks[::max(1, len(ranks) // 20)])\n",
    "    plt.title(\"Кривая Ципфа\")\n",
    "    plt.xlabel(\"Ранг слова\")\n",
    "    plt.ylabel(\"Относительная частота\")\n",
    "    plt.legend()\n",
    "    plt.grid()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "STYgqu_DMOui",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 474
    },
    "id": "STYgqu_DMOui",
    "outputId": "a27ed04e-eca2-4655-bde1-771c2b4d8cc6"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ранг 0: банк\n",
      "Ранг 1: карта\n",
      "Ранг 2: это\n",
      "Ранг 3: деньга\n",
      "Ранг 4: день\n",
      "Ранг 5: мой\n",
      "Ранг 6: кредит\n",
      "Ранг 7: который\n",
      "Ранг 8: отделение\n",
      "Ранг 9: клиент\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEWCAYAAABollyxAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3de3xU9Z3/8dcn9xskkIThTkBQiahoEKRVC2IsdFvpBVvdLbqtLmtbdtu6rWu7rW1tu7u0Xa2/6vay6mrVFm+1pS3ipQLWbkVEUcNNCSgXuZMAIYTcPr8/5gTHGEhIJjmTyfv5eOSROed8z5zPRHyfme+c+Yy5OyIikrxSwi5ARES6l4JeRCTJKehFRJKcgl5EJMkp6EVEkpyCXkQkySnoRUSSnIJeQmNmb5rZJTHLw8xsk5n9IMy6ToaZTTOzbW2sX2Zm13bTMd/1dxNpj4JeEoKZFQNPA4+7+w1h1yOSTBT0EjozKwCeBF4A5ses/7aZPWJmD5rZITN7yczOjtl+7JmtmeWZ2S4zey5mu5vZYTOrMbNKM7s8ZtuNwbpDZrbWzD4Wsy3FzP7bzPYE+9aZ2bIuPkY3s7HB7ZFmdsTM7g+WS4LtNTE/DWb27WC7mdmtZrbXzO4BDJhtZlvN7BUzK405zm3B+oNmtsrMLuxK3ZIcFPQStjzgcSAN+Ky/tyfHbOBhYCDwK+C3Zpbexv18FWhoY/3Z7p4H3Az8NGZ9JXAhkA98B7jfzIYE2y4FPgacFew7n/j6LrCvjfUF7p4XHPPBmPWfAD4ElBL9WwwFCoHTgAeA+2LGrgQm8s7f62Ezy4pz/dLLKOglbD8FaoDhwPvb2L7K3R9x9wbgFiALOD92gJkNBq4Jth9PGjHh6u4Pu/vb7t7s7g8CbwCTY+8WSO3E4zkhMzsLmArcexK7fQR4wN13u/sfge3A3e5eC9wKnGlmJQDufr+773P3Rnf/LyCT6AlB+jAFvYRtPdEguwG408yyW23f2nLD3ZuBbUSf0cb6FvATYH8b9/+SmdUAdxB9Vg+AmV1lZqvNrNrMqoEJQFGw+Umiz5LfMLODwP/r7INrwwLgm7T96uN4IsCetjYEJ8BqYDCAmX3FzNaZ2YHgceXzzuOSPkpBL2H7vrvXufv/EA3177baPqLlhpmlEH3m/3bM9lOBDwK3Hef+zw2mQs4B/juYHx8F/A/RKZlCdy8AKog+i285oTxENFxHAP/ctYd4zMVEp1weOsn99nCcsA6msQqAXcF8/A3AJ4EBweM6QPC4pO9S0Esi+QdgnpnFTqGUmdnHzSwN+BJwFHg+Zvs3gJvdva6d+24CWkIxF3CCZ8lm9hmiz+gJltOAO4Evu/uBjhRuZlmxP0T/30ozs9jpn28DN7TxPkR7FgN/a2bFZvY3wDDgM2aWA3yR6KuiN4F+QGPwuNLM7Cag/0keS5JQWtgFiLRw901BOP2vmZ0TrP4d8Cmic9obgY8H0xUt9gK/PMHdvmJmDhwE/sPdXwUws/8C/go0B/v/JWafG4A33f3RDpY+DDjSxvoLgTrgnmD5ZXdf1sH7jPUQcAHRQP890Vc0+4DXiU7b/J27u5k9ASwJ1h8mOn+/tc17lD7F9MUjkqiCywvHuvunw66lM8zsTuA5d78nzvf7JnCtuz8dz/uV5KWpG5HuU8fJvekq0i00dSPSTdw93tffi3SKpm5ERJKcpm5ERJJcwk3dFBUVeUlJSaf3P3z4MLm5ufErqJfWoDpUR6LXoDriW8eqVav2untxmxvdPaF+ysrKvCuWLl3apf3jIRFqcFcdramOxKrBXXW01pU6gBf9OLmqqRsRkSSnoBcRSXIKehGRJKegFxFJcgp6EZEk16GgN7OZZrbBzDaa2Y1tbM8Mvu5to5mtaPkSBDP7u6Dnd8tPs5lNjO9DEBGRE2k36IM2q3cAs4h+ldmVsd9RGbgGqHL3sUQ75i0A
cPcH3H2iu08E5gKb3X11PB+AiIicWEee0U8GNrr7JnevBxYS/R7PWLN556vRHgFmmFnrLzu4Mti3W2yvPsJ/Pr6eqrrm7jqEiEiv1G6vGzObA8x092uD5bnAFI9p2GRmFcGYbcFyZTBmb8yYSmC2u1e0cYx5wDyASCRStnDhyZ8Pttc082/PHeFTpzizxuWd9P7xVFNTQ15euDWoDtWR6DWojvjWMX369FXuPqnNjcf7JFXLDzAHuDNmeS5we6sxFcDwmOVKoChmeQrwWnvH8i58Mra5udkv+sEz/uEfPt6p/eMpGT5lF0+q490SoY5EqMFddbQW5idjtxPzvZ1Ev7Nz+/HGBF/Dlk/0G3BaXAH8ugPH6jQzo3x8hHX7mqg52tidhxIR6VU6EvQrgXFmNtrMMoiG9qJWYxYBVwe35wDPBGeYli90/iTdOD/forw0QqPD8g17uvtQIiK9RrtB7+6NwHzgCWAd8JC7rzGzm83ssmDYXUChmW0ErgdiL8G8CNjq7pviW/p7lY0aQF46PL1uV3cfSkSk1+hQm2J3X0z0m+hj190Uc7sOuPw4+y4Dzu98iR2XlprC2cVpPLN+Nw1NzaSn6vNgIiJJl4TnRlI5cKSBlW/uD7sUEZGEkHRBf0ZhKhlpKTy1VtM3IiKQhEGflWZcMLaIp9buarm0U0SkT0u6oIfo1Tfbqo6wfuehsEsREQldUgb9jPGDMIOnNX0jIpKcQT+oXxYTRxTwlC6zFBFJzqCH6PTNq9sOsPNAXdiliIiEKnmDfnwEQM/qRaTPS9qgHzsoj5LCHF1mKSJ9XtIGvZlRXhrhr5V7OVTXEHY5IiKhSdqgBygvHUxDk/Ps63vbHywikqSSOujLRg1gYG4GT63dGXYpIiKhSeqgT00xLj590LEmZyIifVFSBz3AJeMjHKxrZOVmNTkTkb4p6YP+olOLyExL4UldfSMifVTSB31ORpqanIlIn5b0QQ/RT8lur1aTMxHpm/pE0M8YH8EMfXhKRPqkPhH0xf0yOWdEgYJeRPqkPhH0AJeURnht+wF2HDgSdikiIj2qzwT9paXRJmfqUS8ifU2fCfpTivMYXZSryyxFpM/pUNCb2Uwz22BmG83sxja2Z5rZg8H2FWZWErPtLDP7q5mtMbPXzCwrfuV3XEuTs+c37VOTMxHpU9oNejNLBe4AZgGlwJVmVtpq2DVAlbuPBW4FFgT7pgH3A9e5+xnANCC0lC0vjdDQ5Cx/fU9YJYiI9LiOPKOfDGx0903uXg8sBGa3GjMbuDe4/Qgww8wMuBR41d1fAXD3fe7eFJ/ST965I1uanGn6RkT6jo4E/TBga8zytmBdm2PcvRE4ABQCpwJuZk+Y2UtmdkPXS+68liZnS9XkTET6EGuvLYCZzQFmuvu1wfJcYIq7z48ZUxGM2RYsVwJTgL8HvgCcB9QCfwK+4e5/anWMecA8gEgkUrZw4cJOP6Camhry8vKOu33VrkZ+8vJRbjgvi9LC1E4fpys19BTVoToSuQbVEd86pk+fvsrdJ7W50d1P+ANMBZ6IWf4a8LVWY54Apga304C9gAFXAPfGjPsm8NUTHa+srMy7YunSpSfcfvhog5/6b4v9W7+r6NJxulJDT1Ed76Y6EqsGd9XRWlfqAF704+RqR6ZuVgLjzGy0mWUE4b2o1ZhFwNXB7TnAM8GBnwDONLOc4I3ZDwBrO3DMbpOTkcaF49TkTET6jnaD3qNz7vOJhvY64CF3X2NmN5vZZcGwu4BCM9sIXA/cGOxbBdxC9GSxGnjJ3f8Y/4dxclqanK3boSZnIpL80joyyN0XA4tbrbsp5nYdcPlx9r2f6CWWCePi0yOYvcZTa3dROrR/2OWIiHSrPvPJ2FjHmpyt03fJikjy65NBD1BeOpiK7Qd5u1pNzkQkufXhoA+anK3Th6dEJLn12aAfOyiPMUW5+pSsiCS9Phv0wLEmZwfV5ExEklifD/qGJmf5BjU5E5Hk1aeD/pyRAyhU
kzMRSXJ9OuiPNTnboCZnIpK8+nTQQ3T65lBdIys27Q+7FBGRbtHng/7CccVkpafoMksRSVp9PuizM1K5YGyxmpyJSNLq80EPcGnQ5GztjoNhlyIiEncKemD66YMwQ1ffiEhSUtATbXJ27sgBCnoRSUoK+kB5aYQ1bx9ku5qciUiSUdAHWpqc/UlX34hIklHQB04pzmNMsZqciUjyUdDHUJMzEUlGCvoY5eOjTc6WqcmZiCQRBX0MNTkTkWSkoI+RmmLMGD+IZet3U9+oJmcikhwU9K2Ulw7m0NFGXtisJmcikhwU9K1cMLaIrPQUnlq7M+xSRETiokNBb2YzzWyDmW00sxvb2J5pZg8G21eYWUmwvsTMjpjZ6uDnZ/EtP/6yM1K5cJyanIlI8mg36M0sFbgDmAWUAleaWWmrYdcAVe4+FrgVWBCzrdLdJwY/18Wp7m5VPj7C2wfqWPO2mpyJSO/XkWf0k4GN7r7J3euBhcDsVmNmA/cGtx8BZpiZxa/MnnXxeDU5E5HkYe1NT5jZHGCmu18bLM8Fprj7/JgxFcGYbcFyJTAFyAPWAK8DB4FvuPuf2zjGPGAeQCQSKVu4cGGnH1BNTQ15eXmd3r/F958/wtEmuPn92aHV0FWqQ3Ukcg2qI751TJ8+fZW7T2pzo7uf8AeYA9wZszwXuL3VmApgeMxyJVAEZAKFwboyYCvQ/0THKysr865YunRpl/Zv8bNlG33Uv/7Bt1XVhlZDV6mOd1MdiVWDu+porSt1AC/6cXK1I1M324ERMcvDg3VtjjGzNCAf2OfuR919X3BCWRWcAE7twDFD19Lk7GlN34hIL9eRoF8JjDOz0WaWAVwBLGo1ZhFwdXB7DvCMu7uZFQdv5mJmY4BxwKb4lN69xhTncYqanIlIEmg36N29EZgPPAGsAx5y9zVmdrOZXRYMuwsoNLONwPVAyyWYFwGvmtlqom/SXufuveaTSJcETc4OHFGTMxHpvdI6MsjdFwOLW627KeZ2HXB5G/s9CjzaxRpDc2lphJ8v38SyDbuZPXFY2OWIiHSKPhl7AhNHDKAoT03ORKR3U9CfQGqKMeP0CMs37FGTMxHptRT07SgvjXDoaCMrNu8LuxQRkU5R0LfjgnEtTc40fSMivZOCvh1Z6dEmZ0+ryZmI9FIK+g4oL1WTMxHpvRT0HTDj9EGkqMmZiPRSCvoOKMzLpGzUAAW9iPRKCvoOKi+NsHbHQbZV1YZdiojISVHQd1B56WBATc5EpPdR0HfQ6KLcaJOzdQp6EeldFPQnobx0MCs27VeTMxHpVRT0J6G8NEJjs7Nsw+6wSxER6TAF/Uk4Z0QBRXmZuvpGRHoVBf1JSEkxLhk/SE3ORKRXUdCfpJYmZ89vUpMzEekdFPQn6f1ji8hOT9X0jYj0Ggr6kxRtclbE0+vU5ExEegcFfSeUl0bYoSZnItJLKOg7Ycb4CCkGT2r6RkR6AQV9JwzMzWDSqIGapxeRXkFB30nlpRHW7TjI1v1qciYiia1DQW9mM81sg5ltNLMb29ieaWYPBttXmFlJq+0jzazGzL4Sn7LDd0lpBICn1ftGRBJcu0FvZqnAHcAsoBS40sxKWw27Bqhy97HArcCCVttvAR7vermJY3RRLmMH5Wn6RkQSXkee0U8GNrr7JnevBxYCs1uNmQ3cG9x+BJhhZgZgZh8FNgNr4lNy4igvjbBi834O1KrJmYgkLmvvWnAzmwPMdPdrg+W5wBR3nx8zpiIYsy1YrgSmAHXAU0A58BWgxt1/1MYx5gHzACKRSNnChQs7/YBqamrIy8vr9P4nY2N1E997vo5/PCuTqUPTQqnhRFSH6kjkGlRHfOuYPn36Knef1OZGdz/hDzAHuDNmeS5we6sxFcDwmOVKoAj4EfDJYN23ga+0d7yysjLviqVLl3Zp/5PR1NTsk773lH/+gVWh1XAiquPdVEdi1eCuOlrrSh3Ai36cXE2jfduBETHLw4N1bY3ZZmZpQD6wj+iz
+jlm9gOgAGg2szp3v70Dx014LU3Ofv/KDo42NpGZlhp2SSIi79GROfqVwDgzG21mGcAVwKJWYxYBVwe35wDPBCeZC929xN1LgB8D/54sId/ikvERao428vym/WGXIiLSpnaD3t0bgfnAE8A64CF3X2NmN5vZZcGwu4BCM9sIXA+85xLMZPVOk7OdYZciItKmjkzd4O6LgcWt1t0Uc7sOuLyd+/h2J+pLeFnpqVx0ahFPr93Nd2c7wcVGIiIJQ5+MjYPy0sHsPFhHxXY1ORORxKOgj4OLTx9EiqHpGxFJSAr6OBiYm8GkkoHqZikiCUlBHyfl4yOs33lITc5EJOEo6OOkPGhypt43IpJoFPRxUlKUy7hBeepmKSIJR0EfRy1Nzg436LtkRSRxKOjjqLw0QlOz88qeprBLERE5RkEfR2cPL6C4XyYv724MuxQRkWMU9HHU0uTstT1NHKxTj3oRSQwK+jj75KQRNDTDtfe+SF2DpnBEJHwK+jg7Z+QA/uGsTFa+uZ/5v3qJxqbmsEsSkT5OQd8Nzh+Sxs2zJ/D0ut3c8OirNDfrKhwRCU+HulfKyZt7/iiqDtdzy1OvU5CdwTc/PF6dLUUkFAr6bvRPF4+lqraeu/+ymYG56cy/eFzYJYlIH6Sg70Zmxjf/ppTq2gZ+9OTrFORk8OnzR4Vdloj0MQr6bpaSYvxgzlkcPNLAN39XQX52Oh85e2jYZYlIH6I3Y3tAemoKd/zduZw3aiDXP7Sa5a/vCbskEelDFPQ9JCs9lTv/fhJjB/XjuvtWseqtqrBLEpE+QkHfg/pnpfPLz04m0j+Tz96zkg07D4Vdkoj0AQr6HlbcL5P7rplCVnoKc+9aoS8qEZFup6APwYiBOfzys1M42tjMp+9awZ5DR8MuSUSSmII+JKcN7sf/fuY8dh88ylV3v8CBI2qCJiLdo0NBb2YzzWyDmW00sxvb2J5pZg8G21eYWUmwfrKZrQ5+XjGzj8W3/N7t3JED+PncMjbuPsS1967kSL2aoIlI/LUb9GaWCtwBzAJKgSvNrLTVsGuAKncfC9wKLAjWVwCT3H0iMBP4uZnp2v0YF51azK2fmsiLb1XxhV+9RIOaoIlInHXkGf1kYKO7b3L3emAhMLvVmNnAvcHtR4AZZmbuXuvuLd/CkQWou1cbPnzWUL730Qk8s343NzyiJmgiEl/mfuJQMbM5wEx3vzZYngtMcff5MWMqgjHbguXKYMxeM5sC3A2MAua6+2NtHGMeMA8gEomULVy4sNMPqKamhry8vE7vHw+dreH3lfU8+kYD5aPS+NvTM7rcBC0R/haqIzHrSIQaVEd865g+ffoqd5/U5kZ3P+EPMAe4M2Z5LnB7qzEVwPCY5UqgqNWY8cALQNaJjldWVuZdsXTp0i7tHw+draG5udlv/v0aH/Wvf/AfP/V6aHXEm+p4t0SoIxFqcFcdrXWlDuBFP06udmS+fDswImZ5eLCurTHbgjn4fGBfqxPKOjOrASYAL3bguH2OmfFvHxpPdW0Dtz79OgNy07lqaknYZYlIL9eROfqVwDgzG21mGcAVwKJWYxYBVwe35wDPuLsH+6QBmNko4HTgzbhUnqRSUowFnziTS8ZH+NaiNfxudetzqojIyWk36D36Zup84AlgHfCQu68xs5vN7LJg2F1AoZltBK4HWi7BvAB4xcxWA48Bn3f3vfF+EMkmLTWF2//2HCaXDORfHnqFpRt2h12SiPRiHbrU0d0XA4tbrbsp5nYdcHkb+90H3NfFGvukrPRU7rx6Elf+z/N87v5V3H/NFCaVDAy7LBHphfTJ2ATWLyudez4zmaH52Xz2npWs23Ew7JJEpBdS0Ce4orxMfnnNZHIy0rjq7hd4a9/hsEsSkV5GQd8LDB+Qw33XTKahqZm5d73A7oN1YZckIr2Igr6XGBfpxz2fmczemqAJWq2aoIlIxyjoe5GJIwr4xdxJbNpzmM+qCZqIdJCCvpe5YFwRt10xkZe3VPG5
B1ZR36gmaCJyYgr6XmjWmUP4/sfOZNmGPXzl4VfUBE1ETkgtg3upKyePpLq2gQVL1lOQk853Ljujy03QRCQ5Keh7ses+MIaq2np+8ewmBuRk8OXyU8MuSUQSkIK+FzMzvjbrdKpr67ntT29QkJPOZ94/OuyyRCTBKOh7OTPj3z92JtW1DXzn92sZkJPBR88ZFnZZIpJA9GZsEkhLTeH/XXkOU8cU8i8Pv8Iz63eFXZKIJBAFfZLISk/lF1eVUTqkP5+7/yVe2Lw/7JJEJEEo6JNItAnaeQwbkM01965ky0F9oEpEFPRJpzAvk/uumUJeZhr/8UIddyzdqE/QivRxCvokNKwgmwfnTeW0Aan88IkNTPvRUha+sIXGJn2KVqQvUtAnqZGFOXypLIuH/nEqQwuyufE3rzHztj/z5JqdLV/WLiJ9hII+yU0ePZDffO59/OzT59Lc7My7bxWX/+yvrHpLb9aK9BUK+j7AzJg5YQhPfvkivv+xCby1v5ZP/PSvzPvli2zcXRN2eSLSzRT0fUhaagp/N2UUy786jX8pP5X/q9zHpbcu52u/eZVd+jITkaSloO+DcjLS+KcZ41j+1Wlc/b4SHlm1jQ/8cCk/fGI9B+v0hSYiyUZB34cV5mXyrY+cwZ+un8YHzxjMHUsr+cAPlnLXc5s52qhLMkWSRYeC3sxmmtkGM9toZje2sT3TzB4Mtq8ws5JgfbmZrTKz14LfF8e3fImHkYU53HbFOfzhny7gjKH5fPcPa5nxX8v57cvb1eteJAm0G/RmlgrcAcwCSoErzay01bBrgCp3HwvcCiwI1u8FPuLuZwJXA/fFq3CJvwnD8rn/2incd81k8rPT+dKDq/nwT57j2df3hF2aiHRBR57RTwY2uvsmd68HFgKzW42ZDdwb3H4EmGFm5u4vu/vbwfo1QLaZZcajcOk+F44r5vfzL+C2KyZysK6Bq+5+gU/fuYKK7QfCLk1EOsHa+/CMmc0BZrr7tcHyXGCKu8+PGVMRjNkWLFcGY/a2up/r3P2SNo4xD5gHEIlEyhYuXNjpB1RTU0NeXl6n94+HRKghXnU0NDtLtzSyqLKemgY4f0gqHx+XwaCcjr+9k0x/j2SpIxFqUB3xrWP69Omr3H1Smxvd/YQ/wBzgzpjlucDtrcZUAMNjliuBopjlM4J1p7R3vLKyMu+KpUuXdmn/eEiEGtzjW8eBI/X+wyXr/bRvLPaxX/+jf+t3Fb73UF2P19EVqiOxanBXHa11pQ7gRT9Ornbkadl2YETM8vBgXZtjzCwNyAf2BcvDgceAq9y9sgPHkwTUPyudr3zwNJZ/dTpzykZw3/Nv8YEfLuMnf3qD2vrGsMsTkRPoSNCvBMaZ2WgzywCuABa1GrOI6JutEH0F8Iy7u5kVAH8EbnT3v8SraAlPpH8W//HxM3niSxfxvlMK+a+nXucDP1zGAyveUtM0kQTVbtC7eyMwH3gCWAc85O5rzOxmM7ssGHYXUGhmG4HrgZZLMOcDY4GbzGx18DMo7o9CetzYQXn84qpJPPq5qYwamMO/PVbBpT9+liUVapomkmg69J2x7r4YWNxq3U0xt+uAy9vY73vA97pYoySwslEDefi6qTy9bjcLlqznuvtXce7IAr72ofGcVzIw7PJEBH0yVuLAzCgvjbDkixey4BNnsr36CJf/7K9ce+9K3th1KOzyRPq8Dj2jF+mItNQUPnXeSC47exj/+3+b+enSSj7442c5d1AqhwfuYNppxeRm6p+cSE/T/3USd9kZqXx+2liuPG8kP11eya+f38QXfvUSmWkpfODUYmadOZgZ4yP0z0oPu1SRPkFBL91mQG4GX//QeM7P3knOqLNYUrGTxyt28OTaXaSnGu8fW8SsCYMpLx3MwNyMsMsVSVoKeul2KWacP6aQ88cUctOHS3l5azVLKnbweMVO/vXR1/j6YxWcP2YgMycM4YNnRBjULyvskkWSioJeelRKilE2agBlowbw9Q+NZ83b
B3k8CP1v/raCm35XwaRRA5g5YQgzJwxmWEF22CWL9HoKegmNmTFhWD4ThuXzlUtP443dNSx+bQdLKnby3T+s5bt/WMvZw/OZOWEIsyYMpqQoN+ySRXolBb0kBDPj1Eg/To3040uXnMrmvYd5vCIa+guWrGfBkvWMH9KfWRMGM2vCYMZF+oVdskivoaCXhDS6KJfPTxvL56eNZev+Wp5Ys5PHK3Zyy1Ovc8tTr3NKcS6zgumdM4b2x8zCLlkkYSnoJeGNGJjDtReO4doLx7DrYF009F/byX8v28jtSzcycmAOsyYMZuaEwUwcUaDQF2lFQS+9SqR/FldNLeGqqSXsqznKk2t38XjFTu56bjM/f3YTQ/Kz+OAZ0emdSSUDSU1R6Iso6KXXKszL5MrJI7ly8kgO1Dbw9Lpo6P/qhS3c839vUpSXyaVnRPjQhCFMGaO+O9J3KeglKeTnpPOJsuF8omw4NUcbWbp+N0sqdvLYS9v51YotFOSkM7ZfM6+nVDJxxADOHJZPdkZq2GWL9AgFvSSdvMw0PnL2UD5y9lCO1Dex/PU9PLlmJ8+uf5t/X7wegNQU4/TB/ThnZAETRwxg4ogCxhTlkqKpHklCCnpJatkZqcwM3qhdtqyaM8qm8srWal7eWsXqrdX89uW3uf/5LQD0z0rj7BEFnDNyAOeMKGDiiAIGqDWDJAEFvfQpxf0yuaQ0wiWlEQCamp3KPTWs3hIN/5e3VHP7M2/QHHx3SklhDhOD8J84ooDxQ/qTkabu3tK7KOilT0tNeeeDWp88L/rVyIePNvLqtgPRZ/1bqvlL5T5+u/ptADLSUpgwtD8TRwwIpn0KGD4gW5d0SkJT0Iu0kpuZxtRTCpl6SiEA7s7bB+qiz/q3RKd8HljxFnf/ZTMARXmZwbP+As4ZUcBZIwrIU999SSD61yjSDjNjWEE2wwqy+ZuzhgDQ0NTM+h2HWB1M96zeWs3T63YF4+HUQf2Ohf/EkQWMG9RP1/RLaBT0Ip2QnprCmcPzOXN4PnOnRtdV19azems09F/eUs2SNTt58MWtAORmpHLW8GjoN+1vpHDbAcYU5+obt6RH6F+ZSJwU5GQw7bRBTDttEBCd8nlzX+2x6Z6Xt1TzP89uorHZ+cWrzw6cyy4AAAysSURBVAEwuH8WpwzKZUxRHqcU5zKmOI9TBuUxpH+WLvWUuFHQi3QTM2N0US6ji3L5+LnDATja2MQjS5ZTWDKeyj2HqdxTQ+Wew/z25e0cOtp4bN/s9FRGF+VyyqCYE0Bx9ISgD3rJyepQ0JvZTOA2IBW4093/s9X2TOCXQBmwD/iUu79pZoXAI8B5wD3uPj+exYv0NplpqQzLS2HahCHvWu/u7Kk5SuXuw2zaW3Ps9+qtVfzh1bdxf2fssIJsxhTnckpx7Ekgj0j/TF39I21qN+jNLBW4AygHtgErzWyRu6+NGXYNUOXuY83sCmAB8CmgDvgmMCH4EZE2mBmD+mUxqF/Wsat9WtQ1NPHmvsPR8N9TQ+WeGjbtPczDL27lcH3TsXG5GanvPPMPwv+UQbmUFOaSla5XAX1ZR57RTwY2uvsmADNbCMwGYoN+NvDt4PYjwO1mZu5+GHjOzMbGr2SRviUrPZXTB/fn9MH937Xe3dl18Oix8G+ZClr5ZtWx6/4hehXQ8AHZwfsA0fA/uK+JMftqGZyfpQ+A9QEdCfphwNaY5W3AlOONcfdGMzsAFAJ741GkiLyXmTE4P4vB+Vm8b2zRu7YdqW9i094aNgXh3/L7hc37OdIQfRWwYOVSzKA4L5MhBdkMK8hiaH42QwuyGVqQFfzOpjA3Q1NCvZx57ORfWwPM5gAz3f3aYHkuMCV2vt3MKoIx24LlymDM3mD574FJx5ujN7N5wDyASCRStnDhwk4/oJqaGvLy8jq9fzwkQg2qQ3W0pdmdqrro1UC1ZLK/ztlX5+w70sy+Omd/nRMzGwRAWgoUZhkDs4yBWSkUZtux5cLsFAZmGVlpnTsR
6L9J/OqYPn36Knef1Na2jjyj3w6MiFkeHqxra8w2M0sD8om+Kdsh7v4L4BcAkyZN8mnTpnV01/dYtmwZXdk/HhKhBtWhOjpTg7tTXdvA9uoj7DhQx9vVR6I/we1N1Uf46466Y72AWuRnpzM0eFUwpI1XBZF+maSlvneKKBH+Fn2hjo4E/UpgnJmNJhroVwB/22rMIuBq4K/AHOAZb++lgogkHDNjQG4GA3IzmDAsv80xjU3N7Dp09J2TQPU7J4Tt1XWsfLOKA0ca3rVPikW/Hawl+IcG00R7dzXSf0sVxXmZFOVl6tLRbtJu0Adz7vOBJ4heXnm3u68xs5uBF919EXAXcJ+ZbQT2Ez0ZAGBmbwL9gQwz+yhwaasrdkSkF0lLTTnWEuJ4ao42siPmlUDsCeHVbdU8UVFHfVMzAD95+f+O7ZebkUpRv2joF+VlBL8zKeqXSXFeBsXHtmXqU8UnoUN/KXdfDCxute6mmNt1wOXH2bekC/WJSC+Ul5nGuEg/xkX6tbm9udnZd7iePz7zHKNOO5M9NUfZW3OUvYfqo79rjrJpz2Fe2LyfqtqGNu8jOz2Von4xJ4O86MmgKOZkUBScHPIy0/r0G8o6JYpIj0tJMYr7ZVKSn8q00wedcGxDUzP7D9ez51BwMqgJTgYxy1v21fLSW1Xsr62nrUnjzLSUd70yiD0RFPXLZOu+JiI7DjIgJ4OCnPSk+9yBgl5EElp6agqR/llE+me1O7axqZn9tfXvemVw7ORw6Ch7ao6yreoIq7ceYP/ho+96U3nByj8fu52VnkJBdjT0C3LSj50ACnIyKMiOLue/a306BdkZCfuZBAW9iCSNtNSUY58wbk9Ts1NVGz0hPPOXlZScWkp1bQNVtfUcONJA1eF6qo80UF1bz8bdNVTVRm83tr7kKEZuRmr0ZNAS/jEnhnct56aTn53BgJx08rPT27wiKZ4U9CLSJ6Wm2LEpnJ2FqUw7c0i7+7g7h+ubqK6tp7q24diJofpIA9XBiaGqtp4DwfodBw4G4+rfc0lqrH6ZaRTkpnNG/wa64ypPBb2ISAeZGXmZaeRlpjF8QMf3a252Dh1tPHYCaHmlEHuyOHCkgawje7qlbgW9iEg3S0kx8rOj0zQjC3OOO27ZsmXdc/xuuVcREUkYCnoRkSSnoBcRSXIKehGRJKegFxFJcgp6EZEkp6AXEUlyCnoRkSTX7lcJ9jQz2wO81YW7KCL876pNhBpAdbSmOhKrBlAdrXWljlHuXtzWhoQL+q4ysxeP972JfakG1aE6Er0G1dFzdWjqRkQkySnoRUSSXDIG/S/CLoDEqAFUR2uq4x2JUAOojta6pY6km6MXEZF3S8Zn9CIiEkNBLyKS5JIi6M3sbjPbbWYVIdcxwsyWmtlaM1tjZl8MqY4sM3vBzF4J6vhOGHUEtaSa2ctm9ocQa3jTzF4zs9Vm9mKIdRSY2SNmtt7M1pnZ1BBqOC34O7T8HDSzL/V0HUEtXw7+fVaY2a/NrP0veu2eOr4Y1LCmJ/8WbeWWmQ00s6fM7I3g90l8j9XxJUXQA/cAM8MuAmgE/sXdS4HzgS+YWWkIdRwFLnb3s4GJwEwzOz+EOgC+CKwL6dixprv7xJCvlb4NWOLupwNnE8Lfxd03BH+HiUAZUAs81tN1mNkw4J+BSe4+AUgFrgihjgnAPwCTif43+bCZje2hw9/De3PrRuBP7j4O+FOw3GVJEfTu/iywPwHq2OHuLwW3DxH9H3lYCHW4u9cEi+nBT4+/625mw4G/Ae7s6WMnGjPLBy4C7gJw93p3rw63KmYAle7elU+id0UakG1maUAO8HYINYwHVrh7rbs3AsuBj/fEgY+TW7OBe4Pb9wIfjcexkiLoE5GZlQDnACtCOn6qma0GdgNPuXsYdfwYuAFoDuHYsRx40sxWmdm8kGoYDewB/jeYyrrTzHJDqqXFFcCvwziwu28HfgRsAXYAB9z9yRBKqQAuNLNCM8sBPgSMCKGOFhF3
3xHc3glE4nGnCvpuYGZ5wKPAl9z9YBg1uHtT8PJ8ODA5eInaY8zsw8Bud1/Vk8c9jgvc/VxgFtHptItCqCENOBf4qbufAxwmTi/LO8PMMoDLgIdDOv4Aos9eRwNDgVwz+3RP1+Hu64AFwJPAEmA10NTTdbTFo9e+x+WVuII+zswsnWjIP+Duvwm7nmB6YCk9/x7G+4HLzOxNYCFwsZnd38M1AMeePeLuu4nOR08OoYxtwLaYV1aPEA3+sMwCXnL3XSEd/xJgs7vvcfcG4DfA+8IoxN3vcvcyd78IqAJeD6OOwC4zGwIQ/N4djztV0MeRmRnROdh17n5LiHUUm1lBcDsbKAfW92QN7v41dx/u7iVEpwiecfcef8ZmZrlm1q/lNnAp0ZfrPcrddwJbzey0YNUMYG1P1xHjSkKatglsAc43s5zg/5sZhPSmvZkNCn6PJDo//6sw6ggsAq4Obl8N/C4ed5oWjzsJm5n9GpgGFJnZNuBb7n5XCKW8H5gLvBbMjwN83d0X93AdQ4B7zSyV6Mn8IXcP7fLGkEWAx6JZQhrwK3dfElIt/wQ8EEybbAI+E0YRwQmvHPjHMI4P4O4rzOwR4CWiV6u9THhtCB41s0KgAfhCT71J3lZuAf8JPGRm1xBt1/7JuBxLLRBERJKbpm5ERJKcgl5EJMkp6EVEkpyCXkQkySnoRUSSXFJcXil9l5k1Aa8R/be8Drja3WvDrUoksegZvfR2R4JujBOAeuC6sAsSSTQKekkmfwbGApjZb4MmZmtiG5mZWU3M7Ulmtiy4/W0z2x7Tp31O6zs3s6vM7NWgz/99MevvMbPNwX71ZlZkUT8M+py/ZmafCsZOM7MDwdhNZnZ9sL7EzP5sZi8FP6G0A5DkpKkbSQpBq9tZRBtTAXzW3fcHLSBWmtmj7r6vnbu51d1/dJz7PwP4BvA+d99rZgNjNqcS/R6C3wS9fSD6UfqJRHucFwU1PBts+7O7f9jMzgN+DtxCtKdJubvXmdk4ou0JwuydL0lEQS+9XXZMu4k/E/R7B/7ZzD4W3B4BjAP2tRqfTbRFbkdcDDzs7nsB3D22j3g2UNdq/AXAr929iWijquXAecBBom1xVxN99TE/GJ8O3G5mE4l2Tzy1g3WJtEtBL73dkaAd8zFmNo1od8Sp7l4bTM9ktR5vZpOI9kTvqqGc3JdmtDyjLwJWmdlC4MvALqKvAFJ474lDpNM0Ry/JKB+oCkL+dKJf69hVzwCXB82vaJm6Cb52roT3dqL8M/Cp4Atgiol+u9QLrcbUEn01kBnUvMPdm4k2xkuNQ80igIJektMSIM3M1hHtBvh8V+/Q3dcA3weWm9krwC1mNpRoG9l57l7fapfHgFeBV4ieJG4IWhXDO1M3LwG3uPsB4L+Bq4P7Pp3oF5OIxIW6V4qIJDk9oxcRSXIKehGRJKegFxFJcgp6EZEkp6AXEUlyCnoRkSSnoBcRSXL/HxB0K4XFLrnPAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plot_zakon_cipfa(text_prep_models=text_prep_models, size=10)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8HRE54kFlekn",
   "metadata": {
    "id": "8HRE54kFlekn"
   },
   "source": [
    "### 1.4.2 Кривая Хипса\n",
    "\n",
    "Закон Хердана-Хипса – имперический закон, согласно которому количество уникальных слов в тексте зависит от длины текста."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "fUUl3vtbMSFa",
   "metadata": {
    "id": "fUUl3vtbMSFa"
   },
   "outputs": [],
   "source": [
    "def plot_zakon_hipsa(k=20, b=0.5):\n",
    "    x = np.arange(1, 10000)\n",
    "    y = x**b * k\n",
    "    plt.plot(x, y)\n",
    "    plt.title(\"Кривая Хипса\")\n",
    "    plt.ylabel(\"Уникальные слова\")\n",
    "    plt.xlabel(\"Длина текста\")\n",
    "    plt.grid()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "Afaxc2uCOQeJ",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 295
    },
    "id": "Afaxc2uCOQeJ",
    "outputId": "8ceb4790-fe7d-4089-f171-d0b603ec937a"
   },
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYwAAAEWCAYAAAB1xKBvAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+WH4yJAAAgAElEQVR4nO3deXhV1dn38e/NPIQZjBBmBRWZiYJ1AmctFrHYOrQOtWKtPlVba9W21uqrta3ap9ZqSyt1FlQceUBFSnAEBGSeZxLmmQAJGe73j72jR0zIScjJyTnn97muXNln7ele2ZA7e+211zJ3R0REpDy14h2AiIgkBiUMERGJihKGiIhERQlDRESiooQhIiJRUcIQEZGoKGGIiEhUlDAkKZjZGjM7J+JzhpmtMrM/xTOuijCzi81sk5m1jCgbZmY5ZtYsnrGJgBKGJCEzawN8AEx09zvjHU+03P0d4L/AXwDMrDnwFHCTu++OZ2wioIQhSSb8Jfs+MAO4JaL8PjN7zczGmtleM5ttZn0i1n95h2JmaWa22cw+jljvZrbPzHLNbKWZXRax7q6wbK+ZLTKz4RHrapnZk2a2Ndw3z8yyDlOFnwEXmtn5BIljqru/fUgcx4bLHc3sgJm9EH7uHK7PjfgqMLP7IvYfZmZzzGxPGPMFYfl1ZrY4rMMqM7uxgj96SQFKGJJM0oCJQB3gR/7NcW+GAa8CLYGXgDfNrG4px/klUFBKeR93TwPuJ/jLv8RK4HSgGfB74AUzaxuuOw8YDvQO972Fw3D3bcCtwIvAUIIEUpYHgO2llDd397TwfGNLCs3sZOC5sH7NgTOANeHqLeH5mgLXAX8xs/6Hi1VSjxKGJJOngFygPXBqKetnuftr7l4APAY0AAZFbmBmRwPXh+vLUoeIX9Tu/qq7b3D3YncfCywHTo48LFC7AvWYRpB83nf3raVtYGa9gVOAZytw3OuB0e4+KYw1x92XhHX4P3df6YGpBHdpp1fg2JIClDAkmSwBLgbuBP5tZg0PWb++ZMHdi4FsoN0h2/wO+Buwo5TjzzazXODvBHcZAJjZ1WEzzy4z2wX0BFqHq98HngeWm9ke4PEo6jGK4E7gIjM7pYxt/gj8ltLvhMrSgeBu6BvM7EIzm2ZmO8I6XBRRBxFACUOSy4Punufu/yJIDg8csr5DyYKZ1SK4E9kQsb47cD7w1zKO3z9s5ukHPBk+Q+gE/IugqamVuzcHFhDcVZQkpleAreH5D9fEhJldH273U+AegsRX75DNzgJahcetiPXAMaWcsz4wDngESA/rMKGkDiIllDAkWd0AjAzb7UsMMLNLzawOcBuQT9D8U+I3wP3unlfOsYuAugTPARoDTpAQMLPrCO4wCD/XAf4N3F5eTyczawf8GbjB3fOBfxA0ff36kE3vA+4s5RlNeZ4GrjOzs8OH8RlmdjxQD6gf1qHQzC4kePYi8jVKGJKU3H0VcC/wn4i/0N8Cvg/sBH4IXBo+zyixjaApqCxzwyapLOAP7j7P3RcBjwKfAZuBXsAnEfvcCaxx93FRhP0kMMbdPwrr4ASJ7zYzOzFiuy/cPSuK432Nu88gfKAN7AamAp3cfS/Bnc8rBD+bK4G3yzqOpC7TBEqSCsKupce6+w/iHYtIotIdhoiIREUJQ0REoqImKRERiYruMEREJCp14h1ArLRu3do7d+5cqX337dtH48aNqzagGk51Tn6pVl9QnStq1qxZ29y9TVnrkzZhdO7cmZkzZ1Zq36ysLAYPHly1AdVwqnPyS7X6gupcUWa29nDr1SQlIiJRUcIQEZGoKGGIiEhUlDBERCQqShgiIhKVmCUMM+tgZlPCKSsXmtmtYXlLM5tkZsvD7y3CcjOzx81shZnNi5zty8yuCbdfbmbXxCpmEREpWyzvMAqBX7h7D4JZzW42sx7AXcBkd+8GTA4/A1wIdAu/RhJOgWlmLQkmtRlI
MIvZ70qSjIiIVJ+YJQx33+jus8PlvcBiIINgXuWSaSWfBS4Jl4cBz4VTRE4DmofzIp8PTHL3He6+E5gEXBCruEVEEtGevAJen53NWysOxuwc1fLinpl1JpilbDrBjF4bw1WbgPRwOYOIKTQJps/MOEx5aecZSXB3Qnp6OllZWZWKNzc3t9L7JirVOfmlWn0h+eu8r8D5Ykshn28qYuG2IgodWjdwPvjvFOrUqvoJE2OeMMwsjWD6x9vcfY/ZV5VwdzezKhv90N1HEcyHTGZmplf2bUe9HZoaUq3OqVZfSM46795fwHuLNjFx/kY+XrGNgiIno3lDrj31aC7q3ZZdK+dw1pAhMTl3TBOGmdUlSBYvuvvrYfFmM2vr7hvDJqctYXkOEXMuE8y3nBN+DT6kPCuWcYuI1CS79h/k/YWb+b/5G/lkxTYKi4Mkcd2pXbioV1v6tG9GyR/jWatiNxV7zBKGBdE/DSx298ciVr0NXAM8HH5/K6L8FjMbQ/CAe3eYVN4DHop40H0ecHes4hYRqQl27T/Iews3MX7eRj5buZ3CYqdDy4Zcf3oXLurZlt4RSaK6xPIO41SCeZPnm9mcsOwegkTxipldD6wFvheumwBcBKwA9hPMPYy77zCzB4DPw+3ud/cdMYxbRCQu9uYVMGnRZsbP28hHy7dSUOR0bNmIH5/elW/3akvPjKbVniQixSxhuPvHQFk1O7uU7R24uYxjjQZGV110IiI1w/6DhXyweAvj524ga9lWDhYWf9ncNLR3W3plVP+dRFmSdnhzEZGaKq+giKylW3hn3kYmL95MXkExRzWpz1UDOzK0dzv6dWhOrRj0cjpSShgiItUgv7CIj5ZtY/y8DUxatJl9B4to1bgeIwa0Z2jvdpzUuSW1a2CSiKSEISISIwVFxXyyYhvj523kvYWb2JtXSPNGdbm4TzuG9m7HoK4tqVM7cYb0U8IQEalCRcXO9FXbeWfeRt5dsJGd+wtoUr8O556YzsV92nHasa2pm0BJIpIShojIEXJ35mbv5q05OYyft5Gte/NpVK8255yQztDebTmjexsa1K0d7zCPmBKGiEglrdiSy9tzcnhr7gbWbt9Pvdq1OOv4o7i4TzvOOv4oGtZL/CQRSQlDRKQCNu3O4525G3hrbg4LcvZgBt86phU3DzmW8088mmYN68Y7xJhRwhARKcfu/QVMXLCRt+ZsYNrq7bhDn/bN+O3QHlzcuy1HNW0Q7xCrhRKGiEgp8gqKmLx4C2/OySFr6RYKipyurRtz69nd+E6fdnRtkxbvEKudEoaISKiwqJhPVm7nrTk5vL9wM7n5hRzVpD7XnNKZYX0z4j40R7wpYYhISnN35qzfxVtzNjB+3ga25R6kSYM6fLtXW4b1bcfArq1q/At11UUJQ0RS0vod+3nzixze+CKHVdv2Ua9OLc454SiG9c1g8HFtqF8nuXo4VQUlDBFJGXvyCpg4fyPjZucwY3Uw6PXALi35yZnHcEGvo2naIHl7OFUFJQwRSWoFRcV8tHwrr8/OYdKizeQXFtO1TWPuOK87w/pm0KFlo3iHmDCUMEQk6bg7C3L28OLifO74eDLbcg/SolFdLj+pA8P7t//aDHUSPSUMEUkaG3Yd4M05ObwxO4flW3KpY3DuiUdzaf/2nNm9DfXqJOYYTjVFLKdoHQ0MBba4e8+wbCxwXLhJc2CXu/c1s87AYmBpuG6au/8k3GcA8AzQkGBWvlvDyZZERMjNL+TdBZt4fXY2n60KXqrL7NSCB4f3pPmeVXz73AHxDjFpxPIO4xngCeC5kgJ3/37Jspk9CuyO2H6lu/ct5ThPATcA0wkSxgXAxBjEKyIJorjYmbZ6O6/NzGbigk0cKCiiY8tG3Hp2N4b3y6BTq8YAZGWtjnOkySWWU7R+GN45fIMFjYffA8463DHMrC3Q1N2nhZ+fAy5BCUMkJa3fsZ9xs7N5bVY22TsP0KR+HS7pl8GIARn079hCzyVizGLZuhMm
jPElTVIR5WcAj7l7ZsR2C4FlwB7gN+7+kZllAg+7+znhdqcDv3L3oWWcbyQwEiA9PX3AmDFjKhV3bm4uaWmp9dq/6pz8ErW++UXOrM1FfJRdwOIdxRjQo1UtTsuoS//02tSvXXaSSNQ6H4kjqfOQIUNmlfxeLk28HnpfAbwc8Xkj0NHdt4fPLN40sxMrelB3HwWMAsjMzPTBgwdXKrisrCwqu2+iUp2TXyLV192ZvW4Xr81az/i5G9mbX0iHlg35+bkduLR/Bu1bRNcVNpHqXFViWedqTxhmVge4FPjySZS75wP54fIsM1sJdAdygPYRu7cPy0QkCW3ek/dlk9OqrftoWLc2F/Y6mssGdGBgl5bU0hAdcRWPO4xzgCXunl1SYGZtgB3uXmRmXYFuwCp332Fme8xsEMFD76uBv8UhZhGJkfzCIj5YtIVXZ63nw2VbKXY4qXMLfnLGMVzUuy1p9dX7v6aIZbfal4HBQGszywZ+5+5PA5fz9eYogDOA+82sACgGfuLuO8J1P+WrbrUT0QNvkaSweOMexn6+njfn5LBrfwFHN23ATYOPYcSADnRp3Tje4UkpYtlL6ooyyq8tpWwcMK6M7WcCPUtbJyKJJTe/kHfmbmDMjHXMzd5Nvdq1OO/EdC7L7MBpx7bWqLA1nO71RCSmSoYPHzNjPe/M28D+g0V0OyqN3w7twaX9MmjRuF68Q5QoKWGISEzs2n+QN77IYezn61myaS8N69bm4j5t+f5JHenfsbnemUhAShgiUmXcnemrdzBmxjomLNjEwcJierdvxkPDe3Fxn7Y00fDhCU0JQ0SO2Na9+Yybnc3Yz9ezets+mjSow+UndeD7J3XgxHbN4h2eVBElDBGplOJi56MV23h5+jo+WLyZwmLn5M4t+Z+zjuXCnm1pWE8z1iUbJQwRqZBtufm8OjObl2asZf2OA7RqXI8fndaF72V24NijUmsYjlSjhCEi5XJ3ZqzewYvT1zFxwUYKipxBXVty5/nHc/6JR2ueiRShhCEiZdqTV8Drs7J5cfo6lm/JpWmDOvxgUCeuGthJdxMpSAlDRL5hfvZuXpi2lrfnbuBAQRF9OjTnTyN6c3Hvdno2kcKUMEQEgAMHi3hn7gZenL6Wudm7aVi3NsP6tuOqgZ3o1V49nUQJQyTlrdiylxemrWPc7Gz25hXSPT2N33/nRIb3z6Cp3puQCEoYIimoqNj5YPFmnvtsDZ+s2E692rW4sNfRXDWwEyd11sx1UjolDJEUsnPfQcbOXM/zn60lZ9cB2jVrwC/PP47LT+pAq7T68Q5PajglDJEUsHDDbp6en8+MDyaTX1jMKV1b8duhJ3DOCenUqa0usRIdJQyRJFVQVMy7Czbx7KdrmLl2J/Vqw4jMjlxzSmeOO7pJvMOTBKSEIZJktuzN4+Xp63lx+lq27M2nU6tG/ObbJ9A2by3fPrdXvMOTBBbLGfdGA0OBLe7eMyy7D7gB2Bpudo+7TwjX3Q1cDxQBP3P398LyC4C/ArWBf7v7w7GKWSSRzV63k+c+XcP/zQ/exD6zexv++N3OnNm9DbVqGVlZ6+IdoiS4WN5hPAM8ATx3SPlf3P2RyAIz60EwdeuJQDvgAzPrHq7+O3AukA18bmZvu/uiGMYtkjAKi4qZuGATT3+8mjnrd9GkfvAm9g8HdaJrG72JLVUrllO0fmhmnaPcfBgwxt3zgdVmtgI4OVy3wt1XAZjZmHBbJQxJabsPFDBmxjqe/XQNG3bn0aV1Y+4fdiKX9m9PWn21NEtsxONf1i1mdjUwE/iFu+8EMoBpEdtkh2UA6w8pH1jWgc1sJDASID09naysrEoFmJubW+l9E5XqnBg27yvm/bUFfJxTSH4RnNCyFpf1r0+fNk6t/DXM/GxNmfsmYn2PlOpctao7YTwFPAB4+P1R4EdVdXB3HwWMAsjMzPTBgwdX6jhZWVlUdt9EpTrXXO7OtFU7ePrj1Uxespk6tYzv9GnPj07rXKHJiRKl
vlVJda5aUScMMzsKaFDy2d0r/ATN3TdHHO9fwPjwYw7QIWLT9mEZhykXSWoHC4t5Z+4GRn+ymoUb9tCycT3+Z8ix/OCUThzVpEH5BxCpYuUmDDP7DsGdQDtgC9AJWEzwgLpCzKytu28MPw4HFoTLbwMvmdlj4Xm6ATMAA7qZWReCRHE5cGVFzyuSSHbsO8hL09fy7Gdr2bo3n25HpfGHS3sxvF8GDepqpFiJn2juMB4ABgEfuHs/MxsC/KC8nczsZWAw0NrMsoHfAYPNrC9Bk9Qa4EYAd19oZq8QPMwuBG5296LwOLcA7xF0qx3t7gsrVEORBLFu+37+/fEqXpm5nryCYs7o3oZHLuvCGd1aa2wnqRGiSRgF7r7dzGqZWS13n2Jm/1veTu5+RSnFTx9m+weBB0spnwBMiCJOkYQ0P3s3//xwJRPmb6R2LeOSvhnccEZXuqfrbWypWaJJGLvMLA34EHjRzLYA+2Iblkhyc3c+XL6Nf05dyacrt9Okfh1uOKMr132rC0c30/MJqZmiSRjDgDzgduAqoBlwfyyDEklWBUXFjJ+3gX9OXcWSTXtJb1qfey46nstP7qi5J6TGKzdhuPuXdxNm9gHQwt23xzQqkSSTm1/ImBnrGP3xajbszqPbUWn8eURvhvXNoF4djRYriSGaXlJ/Bq4hGM/pSqDQzP7r7rfHOjiRRLd1bz7PfLqa5z9by568Qk7u0pIHLunJkOOOolYtPciWxBJNk9RwoCewFGgLFADzYhmUSKLL2XWAUVNXMubz9RwsKub8Hkcz8syu9O/YIt6hiVRaNAljj7tvMbM17p4HYGb5MY5LJCGt2prLU1kreeOL4P3S4f0yuGnwMRoIUJJCNAnjeDObBxwbfjega2zDEkksizfu4e9TVjBh/kbq1q7FVQM7MvLMY8ho3jDeoYlUmWgSxgkxj0IkQc1et5Mnp6zgg8VbaFyvNiPPOIbrT+tCmyaaH1uSTzS9pNaaWR/g9LDoI3efG9uwRGoud+ezldt5YsoKPl25neaN6nL7Od259ludadZIXWMleUXTS+pWglnyXg+LXjCzUe7+t5hGJlLDuDtZy7by+OTlfLFuF22a1OfXF53AlQM70lhzUEgKiOZf+fXAwJL3Mczsj8BngBKGpISSRPG/Hyxn7vpdZDRvyAOX9OSyAe01GKCklGgShhHMs12iKCwTSWqlJYqHL+3Fpf3b62U7SUnRJIz/ANPN7I3w8yUcZhBBkUSnRCFSumgeej9mZlnAaWHRde7+RUyjEokDd2dqmCjmKFGIfEM0D737h4sff1Vk/d19duzCEqk+pSWKP1zai+8qUYh8TTRNUlOBz/n6cwsHzopJRCLV6NOV23jkvaXMXqdEIVKeaBLGCnevcHIws9HAUGCLu/cMy/4MXAwcBFYSNG/tMrPOBNO+Lg13n+buPwn3GQA8AzQkmEjpVnf3isYjEumLdTv50+cHWPTudNo2a8BDw3sxYoAShcjhRJMw2pjZbQRzYmwAprv75ij2ewZ4AnguomwScLe7F4bdc+8GfhWuW+nufUs5zlME74FMJ0gYFwATozi/yDcs2bSHR99fxqRFm2lSD+4d2oMrB3ZU91iRKESTMP4FtCT4C/904G9m9jt3f+ZwO7n7h+GdQ2TZ+xEfpwEjDncMM2sLNHX3aeHn5wh6aSlhSIWs2baPv3ywjLfnbiCtfh3uOK87xxZnc8FpXeIdmkjCsIq27phZa4LhQcodYypMGONLmqQOWfcOMNbdXwi3WwgsA/YAv3H3j8wsE3jY3c8J9zkd+JW7Dy3jfCOBkQDp6ekDxowZU6G6lcjNzSUtLbVGF03WOu/IK+btFQV8mFNIHYNzO9Xlwi51SatnSVvnsqRafUF1rqghQ4bMcvfMstZXeDwDd9/GEQ5IaGa/BgqBF8OijUBHd98ePrN408xOrERso4BRAJmZmT548OBKxZeVlUVl901UyVbnHfsO8uSUFTw3bS3uzg8HdeLm
s47lqCZfzZedbHUuT6rVF1TnqhZNt9pJwGXuviv83AIY4+7nV+aEZnYtwcPws0seXrt7PpAfLs8ys5VAdyAHaB+xe/uwTKRUBw4WMfqT1fwjayX7Dhby3f7t+dnZ3ejQslG8QxNJeNHcYbQuSRYA7r7TzNIrczIzuwC4EzjT3fdHlLcBdrh7kZl1BboBq9x9h5ntMbNBBA+9r0ZjWEkpioqdcbOzeez9ZWzak8c5J6TzqwuOo1t6k3iHJpI0okkYxWbW0d3XAZhZJ6C4vJ3M7GVgMNDazLKB3xH0iqoPTDIz+Kr77BnA/WZWEB77J+6+IzzUT/mqW+1E9MBbIpQM4/HwhCUs3byXPh2a89fL+zKwa6t4hyaSdKJJGL8GPjazqQQv751O+GD5cNz9ilKKSx2Dyt3HAePKWDeTYE5xka+Zn72bP0xczKcrt9OpVSP+fmV/Lup1NOEfIyJSxaIZS+rdcHiQQWHRbeGDb5G4WL9jP4+8v5S35mygZeN63HdxD64c2Ekv3YnEWFS9pMIEMT7GsYgc1t68Ap747wr+88kazODmIcdw45nH0LSBZrkTqQ6aJkxqvKJi55WZ63n0/aVsyz3Id/u3547zu9O2WcN4hyaSUpQwpEb7bOV27h+/iMUb95DZqQX/ufZkerVvFu+wRFJSVAnDzE4Durn7f8IusGnuvjq2oUkqW7d9Pw9NWMy7CzeR0bwhT1zZj2/3aqsH2iJxFM2Le78DMoHjCGbfqwu8AJwa29AkFe3NK+DvU1Yy+uPV1K5l/OLc7txwRlcNDihSA0RzhzEc6AfMBnD3DWamt6GkShUXO6/NzuZP7y5lW24+3+3fnjsvOI70pg3K31lEqkU0CeOgu7uZOYCZNY5xTJJiFuTs5rdvLeCLdbvo37E5T1+TSZ8OzeMdlogcIpqE8YqZ/RNobmY3AD8iGPJc5Ijs3l/AI+8v5cXpa2nZuB6PXtaHS/tn6DmFSA0VzYt7j5jZuQTDjh8H3Ovuk2IemSStkuanhycuYdf+g1x9SmduP7c7zRrqfQqRmizaF/cmEcyWJ3JEFuTs5t63FjB73S4yO7Xg/mED6dGuabzDEpEoRNNLai8QOcuSAe7u+l8uUdt9oIBH31/KC9OC5qdHLuvDpf0yqFVLzU8iiSKaJqkve0SZWXegrrsvjGlUkjTcnfHzNvL7dxaxY1++mp9EEljUb3qb2d3AtcA+M5vq7rfHLCpJCtk79/PbNxcwZelWemU045nrTqJnht7SFklUFRka5DKgL5AHfB6bcCQZFBYV88yna3j0/WWYwW+H9uCaUzpRp7ZGkxVJZBUaS8rdDwCY2f7ytpXUND97N3e/MY8FOXs4+/ijuP+SnmQ01yCBIsmg3D/5zGy+mc0DjjezeWY2n+DN73KZ2Wgz22JmCyLKWprZJDNbHn5vEZabmT1uZivC8/SP2OeacPvlZnZNxaspsbYvv5D/N34Rw/7+MZv35PPkVf359zWZShYiSSSaO4yhR3D8Z4AngOciyu4CJrv7w2Z2V/j5V8CFBHN5dwMGAk8BA82sJcH0rpkEvbVmmdnb7r7zCOKSKvTJim38atw8snce4MqBHfnVBcfrobZIEoomYXj5m5Sxo/uHZtb5kOJhBHN9AzwLZBEkjGHAc+7uwDQza25mbcNtJ5XM8W1mk4ALgJcrG5dUjdz8Qh6asJiXpq+jS+vGvPqTUzipc8t4hyUiMRJNwvi/8HtXYCXhexhA70qeM93dN4bLm4D0cDkDWB+xXXZYVlb5N5jZSML5xtPT08nKyqpUgLm5uZXeN1FVtM4LtxUxekE+O/KcCzrX4dJuzr4188haE7MQq1yqXedUqy+ozlUtmvcwegGY2RfuHtWzi2hFDmpYRccbBYwCyMzM9MGDB1fqOFlZWVR230QVbZ335hXw0ITFvDxzPV3bNGbUdb0Z0Ckx7ypS7TqnWn1Bda5qFeklVVW/2DebWVt33xg2OW0Jy3OADhHbtQ/Lcviq
CaukPKuKYpEKmLpsK3ePm8emPXnceEZXbj+3u+apEEkh0QwNcmm42DxiGXd/vZLnfBu4Bng4/P5WRPktZjaG4KH37jCpvAc8VNKbCjgPuLuS55ZK2H+wkAfGL+blGes4pk1jxt30Lfp1bFH+jiKSVKK5w7g4/D41YtmBchOGmb1McHfQ2syyCXo7PUwwZPr1wFrge+HmE4CLgBXAfuA6AHffYWYP8NXLgveXPACX2Pti3U5uHzuHtTv2M/KMrvxcdxUiKSuaZxjXVfbg7n5FGavOLmVbB24u4zijgdGVjUMqrrComL/9dwVPTFnB0U0b8PINgxjUtVW8wxKROKrIWFInA38hmNP7Hnf/IGZRSVyt3raP28bOYe76XQzvl8Hvh51I0wZ6r0Ik1VXkofejwL3ADoIZ9zJjEpHEjbvz8oz1PDB+EfXq1OKJK/sxtHe7eIclIjVERRJGY3efDBpLKhnlHnRGPj+LSYs2c9qxrXnksj4c3axBvMMSkRokml5SPw8XjwqXjTJenJPENHPNDu799AB7Cw7wm2+fwI9O7aKJjUTkG6K5wyiZQOlfEcvPlbGtJJDiYuepqSt5bNIyWtaHcTd9i97tm8c7LBGpoaLpJfX76ghEqteWvXn8fOxcPl6xjaG923JRm91KFiJyWNE0Sc0rrdzdKzuWlMTZR8u3cvvYueTmF/Dwpb34/kkdmDp1arzDEpEaLpomqdoEL9RJgisudh7/73L+Onk5x7ZJ48UfD+S4o5uUv6OICNEljEJgF5Dv7nkxjkdiZNf+g9w+dg5Tlm7l0v4ZPHhJLxrW0xvbIhK9aBJGM2Ae0MjMDPgMuM3dV8Y0MqkyC3J2c9OLs9i0O48HLunJDwZ2JLiUIiLRi+ahd+eSZTOrD1xGMJPe6TGLSqrMa7Oy+fUb82nRqB5jbzyF/ho0UEQqqSIv7uHu+cALZpYbo3ikiuQXFnH/O4t4cfo6Tunair9d2Y/WafXjHZaIJLCoEoaZ9QR6AHr1NwFsy83nxudnMWvtTm48syu/PO846tSuFe+wRCTBRdOt9ncEQ5T3IBiC/ELgY/TyXo20eOMefvzsTLbvy9dYUCJSpaL5s3MEwXDkm8KhzvsQPAiXGua9hZv47lOfUlhczJAZevMAABTiSURBVKs3fkvJQkSqVDRNUgfcvdjMCs2sKcGUqh3K20mqj7vzZNZK/vzeUvq0b8aoqzNJb6rWQxGpWtEkjJlm1pxgLKlZQC5B19pKMbPjgLERRV0Jhk1vDtwAbA3L73H3CeE+dwPXA0XAz9z9vcqeP9nkFRRx17h5vDlnA8P6tuOP3+2tGfFEJCai6Vb703DxH2b2LtDU3UsdLiQa7r4U6AtgZrWBHOANgilZ/+Luj0Rub2Y9gMuBE4F2wAdm1t3diyobQ7LYtf8gI5+bxYw1O7jjvO7cPORYvV8hIjFT7jMMM+tVsuzua4ClZvZwFZ3/bGClu689zDbDgDHunu/uqwnm/D65is6fsLJ37mfEPz5jzvpdPH5FP245q5uShYjElAVTaR9mA7PZBG92f2hmQ4DHgRfd/YiThpmNBma7+xNmdh9wLbAHmAn8wt13mtkTwDR3fyHc52lgoru/VsrxRgIjAdLT0weMGTOmUnHl5uaSlpZWqX2rw9o9RTw2K5+CIudn/RtwfMsjb4Kq6XWOhVSrc6rVF1TnihoyZMgsdy97NlV3P+wXcDTBM4s3gMlAt/L2ieYLqAdsA9LDz+kEAx3WAh4ERoflTwA/iNjvaWBEeccfMGCAV9aUKVMqvW+sZS3d4j1+O9FPeegDX7ppT5UdtybXOVZSrc6pVl931bmigJl+mN+r5TZJufsm4DyCyZPecvfllUhcpbmQ4O5ic3ieze5e5O7FBA/YS5qdcvh6r6z2YVnKeX12Nj965nM6tmrMGzefSvd0jTQrItUnmmcYewl+QZ8C/K+Z7TWzPVVw7iuAlyPO0zZi3XBgQbj8NnC5mdU3sy5AN2BGFZw/
oTz76Rp+/spcBnZpySs3DlK3WRGpdtH0kqryP2PNrDFwLnBjRPGfzKwv4MCaknXuvtDMXgEWEQy1frOnUA8pd+fvU1bwyPvLOLdHOn+7op+6zYpIXEQzNEj/0srdfXZlT+ru+4BWh5T98DDbP0jwXCOluDsPT1zCPz9cxfB+GfxpRG/qakwoEYmTqF7cA5YTNEuV9Nt04KxYBSVQVOz85s0FvDxjHVef0on7Lj6RWrXUbVZE4ieahHEe8FuCt7z/4O47YhuSFBU7vxo3j9dmZXPzkGO447zj9I6FiMRdNL2kPnD3Mwm61o43s1+bWcPYh5aaioudu8Jkcfs53fnl+ccrWYhIjRDNM4yfR3x8E/gB8D8E72dIFSoO7yxenZXNbed049ZzusU7JBGRL0XTJHVoL6lxsQgk1RUXO3e9HiSLW8/uxm3ndI93SCIiXxNNt9rfV0cgqczd+fWbC3hlZjY/O+tYbtOdhYjUQNE0SU0h6BX1Ne6uXlJV5OF3l/DyjHX8dPAx3H5udz2zEJEaKZomqTsIutO+AFwV23BSzz+mruSfU1fxw0Gd+OX56g0lIjVXNE1SswDM7EDJslSNMTPW8fDEJXynTzt+/50TlSxEpEaryGvDhx8HXSpk4vyN3PPGfAYf14ZHv9dHL+WJSI0XzTOMvQTJolE46KAB7u5NYx1csvp8zQ5uHTOH/h1b8NRVAzTch4gkhLgMPpjK1mzbx8jnZtK+RUP+fU0mDetpIEERSQxl/mlrZu+a2Qgzi+bBuERh576DXPfM55gZ/7nuJJo3qhfvkEREona4tpB/A9cD2Wb2FzPrWU0xJaX8wiJufH4WObsOMOqHA+jUqnG8QxIRqZAyE4a7v+buFwKZwHbgLTObbmYjNZZUxbg797y+gBlrdvDIZX3I7Nwy3iGJiFRYNE9bWxPMt90E2Eow8dHbsQwq2Tz32VrGzQ6G/PhOn3bxDkdEpFLKfD5hZrcAPwLSgP8Afd19Q7hu3ZGe2MzWAHuBIqDQ3TPNrCUwFuhMMOve99x9pwUvKPwVuAjYD1x7JBM4VacZq3fwwPhFnHPCUdx6tob8EJHEdbg7jJOB2929u7v/oSRZhI6rovMPcfe+7p4Zfr4LmOzu3YDJ4WeACwnm8u4GjASeqqLzx9Sm3Xn89MXZdGjZiMe+31fvWohIQjvcM4yr3X1qGesOxCieYcCz4fKzwCUR5c95YBrQ3MzaxiiGKnGwsJibXpzFgYOFjPrhAJo2qBvvkEREjoi5x+cFbjNbDewkeCnwn+4+ysx2uXvzcL0BO929uZmNBx5294/DdZOBX7n7zEOOOZLgDoT09PQBY8aMqVRsubm5pKWlVbZqALyy9CATVhfw0771Ofnomt8zuSrqnGhSrc6pVl9QnStqyJAhsyJafL4hnr/JTnP3HDM7CphkZksiV7q7m1mFspm7jwJGAWRmZvrgwYMrFVhWVhaV3Rfgw2VbmfDuDK4a2JE7h/eq9HGq05HWORGlWp1Trb6gOle1uI1J4e454fctwBsEz0w2lzQ1hd+3hJvnAB0idm8fltU4W/fm8/NX5tI9PY3fDu0R73BERKpMXBKGmTU2syYly8B5wAKC7rrXhJtdA7wVLr8NXG2BQcBud99YzWGXq7jYuePVuezNK+BvV/SnQV0N+yEiySNeTVLpwBvhcN51gJfc/V0z+xx4xcyuB9YC3wu3n0DQpXYFQbfa66o/5PK9NGMdU5dt5YFLenLc0RqCS0SSS1wShruvAvqUUr4dOLuUcgdurobQKm39jv38YcJiTu/Wmh8M7BjvcEREqpzG1a4C7s5dr8/DzHj4u701EZKIJCUljCrw0ox1fLJiO/dcdAIZzTXMlogkJyWMI7RlTx4PT1jCqce24oqTO5S/g4hIglLCOEIPTVhMfmExD17SS01RIpLUlDCOwLRV23lzzgZ+cmZXOrfW/BYiktyUMCqpoKiYe99aQEbzhtw0+Nh4hyMiEnNKGJX00vR1
LNucy70X99C83CKSEpQwKiE3v5DHJy/nlK6tOK9HerzDERGpFkoYlfCvD1exfd9B7rrweD3oFpGUoYRRQVv35vOvj1bx7V5t6dOhebzDERGpNkoYFfRk1gryC4u54/yqmnRQRCQxKGFUwLbcfF6esY7h/TLoom60IpJilDAqYPTHq8kvLOamwcfEOxQRkWqnhBGl3QcKeP6ztVzUsy3HtEmtKR9FREAJI2ovTV/H3vxC3V2ISMpSwohCUbHzwrS1DOrakp4ZzeIdjohIXFR7wjCzDmY2xcwWmdlCM7s1LL/PzHLMbE74dVHEPneb2QozW2pm51d3zFOWbCFn1wGuPqVzdZ9aRKTGiMeMe4XAL9x9djiv9ywzmxSu+4u7PxK5sZn1AC4HTgTaAR+YWXd3L6qugJ+ftpb0pvU5V291i0gKq/Y7DHff6O6zw+W9wGIg4zC7DAPGuHu+u68mmNf75NhHGli/Yz9Tl23lipM7Ure2WvBEJHVZMF12nE5u1hn4EOgJ/By4FtgDzCS4C9lpZk8A09z9hXCfp4GJ7v5aKccbCYwESE9PHzBmzJhKxZWbm0taWtAT6p2VBxm3vIA/n9GQNo2SN2FE1jlVpFqdU62+oDpX1JAhQ2a5e2aZG7h7XL6ANGAWcGn4OR2oTXDX8yAwOix/AvhBxH5PAyPKO/6AAQO8sqZMmeLu7sXFxX72o1k+4qlPKn2sRFFS51SSanVOtfq6q84VBcz0w/xejcufzGZWFxgHvOjurwO4+2Z3L3L3YuBffNXslANEzn3aPiyLuYUb9rBiSy7D+h6uxUxEJDXEo5eUEdwlLHb3xyLK20ZsNhxYEC6/DVxuZvXNrAvQDZhRHbGOn7eROrWMb/dqW/7GIiJJLh69pE4FfgjMN7M5Ydk9wBVm1hdwYA1wI4C7LzSzV4BFBD2sbvZq6iH13yWbOblLS1o0rlcdpxMRqdGqPWG4+8dAaZNITDjMPg8SPNeoNut37GfZ5ly+l9mh/I1FRFJA8nb7OUL/XbIFgLNP0LsXIiKghFGmD5dtpXOrRhrGXEQkpIRRimJ3Zq7dyaCureIdiohIjaGEUYqcXGf3gQJO6twy3qGIiNQYShilWLYz6ISlhCEi8hUljFKs2lVM67T6dGjZMN6hiIjUGEoYpcjOLaZHu6YE7xiKiAgoYXxDYVExOXuLOeHoJvEORUSkRlHCOMTqbfsodDi+rRKGiEgkJYxDrNy6D4Bj2qTWkMgiIuVRwjjEhl0HAGjfolGcIxERqVmUMA6Rs+sA9WpBi0Z14x2KiEiNooRxiA27DtCqoamHlIjIIZQwDrF930Ga1lOyEBE5lBLGIfYcKKBRXSUMEZFDKWEcYm9eIY3qKGGIiBwqYRKGmV1gZkvNbIWZ3RWr8+zJK0DPu0VEvikhEoaZ1Qb+DlwI9CCYzrVHLM51Uc+2HNO8diwOLSKS0BIiYQAnAyvcfZW7HwTGAMNicaI/jujNoLbxmOpcRKRmM3ePdwzlMrMRwAXu/uPw8w+Bge5+yyHbjQRGAqSnpw8YM2ZMpc6Xm5tLWlpqvemtOie/VKsvqM4VNWTIkFnunlnW+qT6U9rdRwGjADIzM33w4MGVOk5WVhaV3TdRqc7JL9XqC6pzVUuUJqkcoEPE5/ZhmYiIVJNESRifA93MrIuZ1QMuB96Oc0wiIiklIZqk3L3QzG4B3gNqA6PdfWGcwxIRSSkJkTAA3H0CMCHecYiIpKpEaZISEZE4U8IQEZGoJMR7GJVhZluBtZXcvTWwrQrDSQSqc/JLtfqC6lxRndy9TVkrkzZhHAkzm3m4l1eSkeqc/FKtvqA6VzU1SYmISFSUMEREJCpKGKUbFe8A4kB1Tn6pVl9QnauUnmGIiEhUdIchIiJRUcIQEZGoKGFEqK5pYKuDmXUwsylmtsjMFprZrWF5SzObZGbLw+8twnIzs8fDus8zs/4Rx7om3H65mV0TrzpFw8xqm9kXZjY+/NzFzKaH9RobDl6J
mdUPP68I13eOOMbdYflSMzs/PjWJnpk1N7PXzGyJmS02s1OS+Tqb2e3hv+kFZvaymTVIxutsZqPNbIuZLYgoq7LramYDzGx+uM/jZmblBuXu+gqe49QGVgJdgXrAXKBHvOM6gvq0BfqHy02AZQTT2/4JuCssvwv4Y7h8ETARMGAQMD0sbwmsCr+3CJdbxLt+h6n3z4GXgPHh51eAy8PlfwA3hcs/Bf4RLl8OjA2Xe4TXvj7QJfw3UTve9Sqnzs8CPw6X6wHNk/U6AxnAaqBhxPW9NhmvM3AG0B9YEFFWZdcVmBFua+G+F5YbU7x/KDXlCzgFeC/i893A3fGOqwrr9xZwLrAUaBuWtQWWhsv/BK6I2H5puP4K4J8R5V/briZ9EcyTMhk4Cxgf/kfYBtQ59BoTjHx8SrhcJ9zODr3ukdvVxC+gWfgL1A4pT8rrHCaM9eEvwDrhdT4/Wa8z0PmQhFEl1zVctySi/GvblfWlJqmvlPxDLJEdliW88Da8HzAdSHf3jeGqTUB6uFxW/RPp5/K/wJ1Acfi5FbDL3QvDz5Gxf1mvcP3ucPtEqi8Efx1vBf4TNsX928wak6TX2d1zgEeAdcBGgus2i+S/ziWq6rpmhMuHlh+WEkaSM7M0YBxwm7vviVznwZ8WSdGv2syGAlvcfVa8Y6lmdQiaLZ5y937APoKmii8l2XVuAQwjSJTtgMbABXENKk7icV2VML6SdNPAmlldgmTxoru/HhZvNrO24fq2wJawvKz6J8rP5VTgO2a2BhhD0Cz1V6C5mZXM+xIZ+5f1Ctc3A7aTOPUtkQ1ku/v08PNrBAkkWa/zOcBqd9/q7gXA6wTXPtmvc4mquq454fKh5YelhPGVpJoGNuzx8DSw2N0fi1j1NlDSU+IagmcbJeVXh70tBgG7w1vf94DzzKxF+NfdeWFZjeLud7t7e3fvTHDt/uvuVwFTgBHhZofWt+TnMCLc3sPyy8PeNV2AbgQPB2skd98ErDez48Kis4FFJOl1JmiKGmRmjcJ/4yX1TerrHKFKrmu4bo+ZDQp/jldHHKts8X6oU5O+CHoaLCPoMfHreMdzhHU5jeB2dR4wJ/y6iKD9djKwHPgAaBlub8Dfw7rPBzIjjvUjYEX4dV286xZF3QfzVS+prgS/CFYArwL1w/IG4ecV4fquEfv/Ovw5LCWKniPx/gL6AjPDa/0mQW+YpL3OwO+BJcAC4HmCnk5Jd52Blwme0xQQ3EleX5XXFcgMf4YrgSc4pONEaV8aGkRERKKiJikREYmKEoaIiERFCUNERKKihCEiIlFRwhARkagoYUjKCEc3XWRmc8wsx8zui3dMIolECUNSzYXu3hf4S7wDEUk0ShiSSuoC+aWtMLPBZrY7vPvYZGZ3hOVrzKx1uPxCydwEZnatmT0Rsf8TZnZtuHyvmX0e3tGMOnSeATM7JjzPHDMrilhuF65718xmmdlHZnZ8uM8zZjYiXH6q5O7IzNLN7A0zmxt+fcvM/hxRj5xw+X4zSzOzyWY2O5wHYVgV/3wlydUpfxORpNEE2FvGutrAVHf/TmlNVWbWC+gZ5XmecPf7w/2eB4YC75SsdPeVBG9nY2a54R1PyXkmAz9x9+VmNhB4kmBcrJL19wK13L0kxsfDuIebWW0gzd0/Dbe9D8h190fCz3WA4e6+J0yC08zsbdfbuxIlJQxJCeEv0ybuvq+MTRoCeYc5xP8Dfgc8GFH2fTM7LVzOIBieA2CImd0JNCKYt2EhEQnjMDGmAd8CXo24Kakfscm1BHOaRA4mdxbBOEC4exHB8N1lngJ4yMzOIBgCPoNgeOxN5cUmAkoYkjq6EowTVpZ2wIYy1n0LyCWYoS3SWHe/BYImqfB7A4K7gkx3Xx/+ld8gyhhrEczr0LeM9S2B2wnmg7g6ymNGugpoAwxw94JwZN9oYxPRMwxJGd8DPittRXj3cSnwSRn73gfcG+V5Sn4BbwvvGEYcbuNIHsxXstrMLgvjMjPrE7HJ
Y+7+JNDOzM4LyyYDN5XUw8yaHeYUzQjmDCkwsyFAp2hjEwHdYUgKMLObCJqU1kY0IbUBapvZbILh0JcTzB1SmunuvtKCmQsPy913mdm/CEYB3UQwbH5FXAU8ZWa/IXhIP4Zv3tncCLxtZicBtwKjzOx6oIggeZSaGIEXgXfMbD5B89mSCsYmKU6j1UrSC5uF1rj7M9GUi0jp1CQlIiJR0R2GJL2wO6mHvYjKLReR0ilhiIhIVNQkJSIiUVHCEBGRqChhiIhIVJQwREQkKv8fCZM/8HXPGnYAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plot_zakon_hipsa()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ov57JymROrRL",
   "metadata": {
    "id": "ov57JymROrRL"
   },
   "source": [
    "## 1.5 Частоты слов\n",
    "### 1.5.1 Какое слово встречается чаще, \"сотрудник\" или \"клиент\"?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "oM-iPCpuOsUH",
   "metadata": {
    "id": "oM-iPCpuOsUH"
   },
   "outputs": [],
   "source": [
    "stat_tokens = get_object_counts(text_prep_models, stopwords=True, lemmatize=True)\n",
    "stat_tokens = dict(stat_tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "IZszJsL7PLC4",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "IZszJsL7PLC4",
    "outputId": "a6370162-ea4f-409f-d32f-2ad807b04839"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "СОТРУДНИК встречается чаще (136889 против 134168)\n"
     ]
    }
   ],
   "source": [
    "word1 = 'сотрудник'\n",
    "word2 = 'клиент'\n",
    "\n",
    "word1_counter = stat_tokens.get(word1)\n",
    "word2_counter = stat_tokens.get(word2)\n",
    "\n",
    "assert word1_counter is not None, f\"Слово {word1} отсутствует в тексте\"\n",
    "assert word2_counter is not None, f\"Слово {word2} отсутствует в тексте\"\n",
    "\n",
    "if word1_counter > word2_counter:\n",
    "    print(f\"{word1.upper()} встречается чаще ({word1_counter} против {word2_counter})\")\n",
    "elif word1_counter < word2_counter:\n",
    "    print(f\"{word1.upper()} встречается чаще ({word2_counter} против {word1_counter})\")\n",
    "else:\n",
    "    print(f\"{word1.upper()} и {word2.upper()} встречаются одинаково ({word1_counter})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cbkBy_mbZSJH",
   "metadata": {
    "id": "cbkBy_mbZSJH"
   },
   "source": [
    "### 1.5.2 Какое слово встречается чаще, \"мошенничество\" или \"доверие\"?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "TzXwyYdoQRKO",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "TzXwyYdoQRKO",
    "outputId": "8ed00db1-d75a-49c7-b934-6b0eb8cbea28"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "МОШЕННИЧЕСТВО встречается чаще (3244 против 2108)\n"
     ]
    }
   ],
   "source": [
    "word1 = 'мошенничество'\n",
    "word2 = 'доверие'\n",
    "\n",
    "word1_counter = stat_tokens.get(word1)\n",
    "word2_counter = stat_tokens.get(word2)\n",
    "\n",
    "assert word1_counter is not None, f\"Слово {word1} отсутствует в тексте\"\n",
    "assert word2_counter is not None, f\"Слово {word2} отсутствует в тексте\"\n",
    "\n",
    "if word1_counter > word2_counter:\n",
    "    print(f\"{word1.upper()} встречается чаще ({word1_counter} против {word2_counter})\")\n",
    "elif word1_counter < word2_counter:\n",
    "    print(f\"{word1.upper()} встречается чаще ({word2_counter} против {word1_counter})\")\n",
    "else:\n",
    "    print(f\"{word1.upper()} и {word2.upper()} встречаются одинаково ({word1_counter})\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "78NpnkPmRk5G",
   "metadata": {
    "id": "78NpnkPmRk5G"
   },
   "source": [
    "## 1.6 В поле \"rating_grade\" записана оценка отзыва по шкале от 1 до 5. Используйте меру $tf-idf$, для того, чтобы найти ключевые слова и биграмы для положительных отзывов (с оценкой 5) и отрицательных отзывов (с оценкой 1)\n",
    "\n",
    "Для оценки сформируем два документа:\n",
    "- документ со всеми положительными отзывами;\n",
    "- документ со всеми отрицательными отзывами.\n",
    "\n",
    "В рамках указанных документов обучим модель TF-IDF и найдем значимые n-граммы для каждого документа."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "MKlnylt3UmYM",
   "metadata": {
    "id": "MKlnylt3UmYM"
   },
   "outputs": [],
   "source": [
    "negative_text = \"\"\n",
    "positive_text = \"\"\n",
    "\n",
    "for index, df_row in enumerate(df_banki.itertuples()):\n",
    "    if df_row.rating_grade == 1:\n",
    "        negative_text += ' '.join(text_prep_models[index].get_tokens(stopwords=True, lemmatize=True)) + ' '\n",
    "    elif df_row.rating_grade == 5:\n",
    "        positive_text += ' '.join(text_prep_models[index].get_tokens(stopwords=True, lemmatize=True)) + ' '\n",
    "\n",
    "documents = [positive_text, negative_text]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "VRHyakCNW4hK",
   "metadata": {
    "id": "VRHyakCNW4hK"
   },
   "outputs": [],
   "source": [
    "tf_idf_model = TfidfVectorizer(ngram_range=(1,2))\n",
    "tfidf_result = tf_idf_model.fit_transform(documents).todense()\n",
    "\n",
    "# Свойство vocabulary_ содержит индексированный словарь слов модели\n",
    "# Нам нужно по индексу находить конкретное слово, создаем такой словарь\n",
    "tfidf_vocab = dict(zip(tf_idf_model.vocabulary_.values(),tf_idf_model.vocabulary_.keys()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "gsb3OeR3wY1a",
   "metadata": {
    "id": "gsb3OeR3wY1a"
   },
   "outputs": [],
   "source": [
    "# Находим 10 индексов n-грамм с максимальными значениями TF-IDF для позитивных примеров\n",
    "top_pos_index = np.ravel(np.argsort(tfidf_result[0]))[::-1][:10]\n",
    "# Находим 10 индексов n-грамм с максимальными значениями TF-IDF для негативных примеров\n",
    "top_neg_index =np.ravel(np.argsort(tfidf_result[1]))[::-1][:10]\n",
    "\n",
    "# Находим ТОП-10 n-грамм с максимальным значением TF-IDF для позитивных примеров\n",
    "top_pos_features = [tfidf_vocab[index] for index in top_pos_index]\n",
    "# Находим ТОП-10 n-грамм с максимальным значением TF-IDF для негативных примеров\n",
    "top_neg_features = [tfidf_vocab[index] for index in top_neg_index]\n",
    "\n",
    "# Находим значения TF-IDF n-грамм для позитивных примеров\n",
    "top_pos_tfidf = [tfidf_result[0, index] for index in top_pos_index]\n",
    "# Находим значения TF-IDF n-грамм для негативных примеров\n",
    "top_neg_tfidf = [tfidf_result[1, index] for index in top_neg_index]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "bqLD32bIxNM6",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "bqLD32bIxNM6",
    "outputId": "a4daf758-d1d7-4bf8-b9d7-8a925177014a"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Самые важные n-граммы для позитивных сообщений\n",
      "\n",
      "1: банк (0.6384085229888494)\n",
      "2: карта (0.2455695532418993)\n",
      "3: это (0.1660616585713556)\n",
      "4: очень (0.1533771394855191)\n",
      "5: день (0.13315813333097695)\n",
      "6: кредит (0.13302132033621447)\n",
      "7: клиент (0.13015801980297095)\n",
      "8: отделение (0.1272360851291149)\n",
      "9: который (0.11784485027434692)\n",
      "10: сотрудник (0.11519654159001587)\n",
      "Самые важные n-граммы для негативных сообщений\n",
      "\n",
      "1: банк (0.5590533039888984)\n",
      "2: карта (0.28425504884224523)\n",
      "3: это (0.17767257398912706)\n",
      "4: деньга (0.1705628290915602)\n",
      "5: день (0.1614465779668716)\n",
      "6: мой (0.14842756198283633)\n",
      "7: отделение (0.145948216070935)\n",
      "8: который (0.13510720257274983)\n",
      "9: кредит (0.13075854744070944)\n",
      "10: сотрудник (0.12982021791081003)\n"
     ]
    }
   ],
   "source": [
    "print('Самые важные n-граммы для позитивных сообщений\\n')\n",
    "for index, (word, tfidf) in enumerate(zip(top_pos_features, top_pos_tfidf), 1):\n",
    "    print(f'{index}: {word} ({tfidf})')\n",
    "\n",
    "print('Самые важные n-граммы для негативных сообщений\\n')\n",
    "for index, (word, tfidf) in enumerate(zip(top_neg_features, top_neg_tfidf), 1):\n",
    "    print(f'{index}: {word} ({tfidf})')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ROlb3D0EVVD",
   "metadata": {
    "id": "0ROlb3D0EVVD"
   },
   "source": [
    "# Часть 2. Тематическое моделирование [20/100]\n",
    "\n",
    "##2.1 Построение несколько тематических моделей коллекции документов с разным числом тем.\n",
    "\n",
    "### 2.1.1 Тематическое моделирование LSI, кол-во тем = 50"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "Xlp2tYWURjDs",
   "metadata": {
    "id": "Xlp2tYWURjDs"
   },
   "outputs": [],
   "source": [
    "all_texts = [text_prep_model.get_tokens(stopwords=True, lemmatize=True) \n",
    "             for text_prep_model in text_prep_models]\n",
    "\n",
    "dictionary = Dictionary(all_texts)\n",
    "corpus = [dictionary.doc2bow(text) for text in all_texts]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "_7EF4O7nSnwj",
   "metadata": {
    "id": "_7EF4O7nSnwj"
   },
   "outputs": [],
   "source": [
    "gs_tfidf = TfidfModel(corpus)\n",
    "corpus_tfidf = gs_tfidf[corpus]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "X7vc_07khGuW",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "X7vc_07khGuW",
    "outputId": "fae7319c-98f1-4db1-f24c-6bb5fe06451a"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0,\n",
       "  '0.189*\"карта\" + 0.133*\"кредит\" + 0.114*\"деньга\" + 0.108*\"заявление\" + 0.105*\"счёт\" + 0.100*\"банкомат\" + 0.100*\"вклад\" + 0.099*\"отделение\" + 0.095*\"сбербанк\" + 0.095*\"сумма\"'),\n",
       " (1,\n",
       "  '-0.386*\"вклад\" + 0.357*\"кредит\" + -0.210*\"банкомат\" + -0.166*\"карта\" + 0.166*\"страховка\" + 0.146*\"задолженность\" + 0.143*\"платёж\" + 0.138*\"погашение\" + -0.118*\"деньга\" + 0.116*\"звонок\"'),\n",
       " (2,\n",
       "  '-0.492*\"вклад\" + 0.286*\"банкомат\" + 0.220*\"карта\" + 0.185*\"сбербанк\" + 0.136*\"заявление\" + -0.134*\"кредит\" + 0.120*\"деньга\" + -0.119*\"договор\" + 0.119*\"операция\" + -0.114*\"очередь\"'),\n",
       " (3,\n",
       "  '-0.257*\"вклад\" + -0.219*\"сумма\" + 0.208*\"номер\" + -0.185*\"руб\" + 0.162*\"заявка\" + -0.160*\"погашение\" + 0.160*\"звонок\" + -0.156*\"платёж\" + -0.154*\"договор\" + -0.144*\"процент\"'),\n",
       " (4,\n",
       "  '0.341*\"вклад\" + 0.254*\"номер\" + -0.222*\"карта\" + -0.220*\"страховка\" + -0.206*\"заявка\" + 0.183*\"звонок\" + -0.148*\"кредит\" + 0.137*\"ваш\" + 0.131*\"телефон\" + -0.114*\"одобрить\"')]"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lsi = lsimodel.LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=50)\n",
    "lsi.show_topics(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "SH0U4p6tfBoO",
   "metadata": {
    "id": "SH0U4p6tfBoO"
   },
   "source": [
    "Посмотрим на первые пять тем"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "_cyU3bCZT0VD",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "_cyU3bCZT0VD",
    "outputId": "8d39e706-0b01-49b5-ead5-96a92b879a33"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0,\n",
       "  '0.189*\"карта\" + 0.133*\"кредит\" + 0.114*\"деньга\" + 0.108*\"заявление\" + 0.105*\"счёт\" + 0.100*\"банкомат\" + 0.100*\"вклад\" + 0.099*\"отделение\" + 0.095*\"сбербанк\" + 0.095*\"сумма\"'),\n",
       " (1,\n",
       "  '-0.386*\"вклад\" + 0.357*\"кредит\" + -0.210*\"банкомат\" + -0.166*\"карта\" + 0.166*\"страховка\" + 0.146*\"задолженность\" + 0.143*\"платёж\" + 0.138*\"погашение\" + -0.118*\"деньга\" + 0.116*\"звонок\"'),\n",
       " (2,\n",
       "  '-0.492*\"вклад\" + 0.286*\"банкомат\" + 0.220*\"карта\" + 0.185*\"сбербанк\" + 0.136*\"заявление\" + -0.134*\"кредит\" + 0.120*\"деньга\" + -0.119*\"договор\" + 0.119*\"операция\" + -0.114*\"очередь\"'),\n",
       " (3,\n",
       "  '-0.257*\"вклад\" + -0.219*\"сумма\" + 0.208*\"номер\" + -0.185*\"руб\" + 0.162*\"заявка\" + -0.160*\"погашение\" + 0.160*\"звонок\" + -0.156*\"платёж\" + -0.154*\"договор\" + -0.144*\"процент\"'),\n",
       " (4,\n",
       "  '0.341*\"вклад\" + 0.254*\"номер\" + -0.222*\"карта\" + -0.220*\"страховка\" + -0.206*\"заявка\" + 0.183*\"звонок\" + -0.148*\"кредит\" + 0.137*\"ваш\" + 0.131*\"телефон\" + -0.114*\"одобрить\"')]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lsi.show_topics(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "iH9zsaBhfF8G",
   "metadata": {
    "id": "iH9zsaBhfF8G"
   },
   "source": [
    "Попробуем обяснить тему с индексом 0.\n",
    "\n",
    "0.189*\"карта\" + 0.133*\"кредит\" + 0.114*\"деньга\" + 0.108*\"заявление\" + 0.105*\"счёт\" + 0.100*\"банкомат\" + 0.100*\"вклад\" + 0.099*\"отделение\" + 0.095*\"сбербанк\" + 0.095*\"сумма\"\n",
    "\n",
    "У всех слов вес положительный, значит тема касается так или иначе всех слов.\n",
    "Предполагаем, что тема связана с заявлениями на открытие счетов для кредитных карт в отделениях Сбербанка на определенную сумму, а так же вклады и снятие наличных в банкомате с кредитных карт."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "XiU5Zjuyg_i-",
   "metadata": {
    "id": "XiU5Zjuyg_i-"
   },
   "source": [
    "### 2.1.2 Тематическое моделирование LSI, кол-во тем = 100"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9sXRAgichC3E",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "9sXRAgichC3E",
    "outputId": "00b79ee1-dcbc-41f1-c2f5-a146637a8ed2"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(0,\n",
       "  '0.189*\"карта\" + 0.133*\"кредит\" + 0.114*\"деньга\" + 0.108*\"заявление\" + 0.105*\"счёт\" + 0.100*\"банкомат\" + 0.100*\"вклад\" + 0.099*\"отделение\" + 0.095*\"сбербанк\" + 0.095*\"сумма\"'),\n",
       " (1,\n",
       "  '-0.384*\"вклад\" + 0.357*\"кредит\" + -0.211*\"банкомат\" + 0.167*\"страховка\" + -0.167*\"карта\" + 0.146*\"задолженность\" + 0.143*\"платёж\" + 0.137*\"погашение\" + -0.118*\"деньга\" + 0.116*\"звонок\"'),\n",
       " (2,\n",
       "  '-0.493*\"вклад\" + 0.285*\"банкомат\" + 0.219*\"карта\" + 0.186*\"сбербанк\" + 0.137*\"заявление\" + -0.133*\"кредит\" + -0.120*\"договор\" + 0.119*\"деньга\" + 0.118*\"операция\" + -0.115*\"очередь\"'),\n",
       " (3,\n",
       "  '-0.252*\"вклад\" + -0.218*\"сумма\" + 0.212*\"номер\" + -0.185*\"руб\" + 0.163*\"звонок\" + -0.159*\"погашение\" + 0.159*\"заявка\" + -0.155*\"платёж\" + -0.154*\"договор\" + -0.145*\"процент\"'),\n",
       " (4,\n",
       "  '0.344*\"вклад\" + 0.253*\"номер\" + -0.228*\"карта\" + -0.213*\"страховка\" + -0.209*\"заявка\" + 0.183*\"звонок\" + -0.143*\"кредит\" + 0.139*\"ваш\" + 0.129*\"телефон\" + -0.114*\"лимит\"')]"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lsi = lsimodel.LsiModel(corpus=corpus_tfidf, id2word=dictionary, num_topics=100)\n",
    "lsi.show_topics(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "AczJIwUPn6LG",
   "metadata": {
    "id": "AczJIwUPn6LG"
   },
   "source": [
    "Получили аналогичный результат, только поменялись немного веса для определенных тем."
   ]
  },
  {
   "cell_type": "markdown",
   "id": "Kqefm9a3W3Q7",
   "metadata": {
    "id": "Kqefm9a3W3Q7"
   },
   "source": [
    "## 2.1 Найти темы с упоминанием банка ВТБ"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "BjjGSc7BW6bm",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "BjjGSc7BW6bm",
    "outputId": "4438b84a-feba-43ed-da40-c559eb16f639"
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(24,\n",
       "  '-0.356*\"комиссия\" + 0.253*\"сч\" + -0.198*\"заявление\" + 0.161*\"г\" + 0.159*\"руб\" + -0.158*\"очередь\" + -0.125*\"тариф\" + -0.125*\"претензия\" + -0.121*\"рассмотрение\" + 0.119*\"втб\"'),\n",
       " (31,\n",
       "  '-0.264*\"справка\" + 0.230*\"втб\" + -0.230*\"платёж\" + 0.206*\"претензия\" + -0.194*\"услуга\" + -0.187*\"заявление\" + 0.154*\"код\" + 0.138*\"письмо\" + -0.136*\"альфа\" + -0.132*\"лимит\"'),\n",
       " (35,\n",
       "  '-0.320*\"р\" + 0.264*\"рубль\" + 0.239*\"лимит\" + -0.228*\"договор\" + -0.211*\"центр\" + -0.189*\"колл\" + 0.174*\"страховка\" + 0.147*\"втб\" + -0.144*\"справка\" + -0.141*\"курс\"'),\n",
       " (39,\n",
       "  '-0.257*\"претензия\" + 0.251*\"ставка\" + -0.187*\"комиссия\" + 0.183*\"перевод\" + 0.169*\"альфа\" + -0.166*\"связный\" + 0.146*\"р\" + 0.144*\"ru\" + 0.144*\"заявление\" + 0.131*\"втб\"'),\n",
       " (42,\n",
       "  '-0.251*\"поддержка\" + 0.246*\"втб\" + -0.239*\"служба\" + 0.212*\"заявка\" + -0.211*\"купюра\" + 0.170*\"выписка\" + 0.149*\"офис\" + -0.144*\"средство\" + 0.143*\"код\" + 0.140*\"центр\"'),\n",
       " (47,\n",
       "  '-0.367*\"менеджер\" + -0.250*\"стандарт\" + -0.246*\"офис\" + -0.243*\"русский\" + 0.194*\"связный\" + 0.182*\"втб\" + -0.152*\"код\" + -0.137*\"пин\" + 0.132*\"смс\" + -0.110*\"пароль\"'),\n",
       " (51,\n",
       "  '0.253*\"купюра\" + -0.227*\"код\" + -0.226*\"пин\" + -0.212*\"претензия\" + 0.207*\"выписка\" + -0.187*\"ставка\" + 0.172*\"альфа\" + 0.171*\"кассир\" + 0.167*\"договор\" + 0.158*\"втб\"'),\n",
       " (53,\n",
       "  '0.309*\"менеджер\" + 0.201*\"претензия\" + 0.201*\"стандарт\" + 0.195*\"русский\" + -0.173*\"история\" + 0.160*\"лимит\" + 0.151*\"ваш\" + 0.149*\"втб\" + -0.143*\"специалист\" + -0.139*\"код\"')]"
      ]
     },
     "execution_count": 41,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = r'\\+ [\\d+]\\.[\\d]+\\*\"втб\"'\n",
    "word = re.compile(pattern)\n",
    "vtb_topics = [topic for topic in lsi.show_topics() if word.findall(topic[1])]\n",
    "vtb_topics"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "v7WKYrW3oram",
   "metadata": {
    "id": "v7WKYrW3oram"
   },
   "source": [
    "Попробуем обяснить тему с индексом 31.\n",
    "\n",
    "-0.264*\"справка\" + 0.230*\"втб\" + -0.230*\"платёж\" + 0.206*\"претензия\" + -0.194*\"услуга\" + -0.187*\"заявление\" + 0.154*\"код\" + 0.138*\"письмо\" + -0.136*\"альфа\" + -0.132*\"лимит\"\n",
    "\n",
    "Тема вероятно связана с претензией в банк ВТБ и в ней участвует некий код (возможно, кодовое слово из договора) и заявление. При этом претензия мало связана с платежами, услугами и лимитами.\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "U3wQamp4YU1i",
   "metadata": {
    "id": "U3wQamp4YU1i"
   },
   "source": [
    "# Часть 3. Классификация текстов [40/100]\n",
    "\n",
    "Сформулируем для простоты задачу бинарной классификации: будем классифицировать на два класса, то есть, различать резко отрицательные отзывы (с оценкой 1) и положительные отзывы (с оценкой 5).\n",
    "\n",
    "Требуется сравнить, как изменяется качество решения задачи при использовании скрытых тем в качестве признаков:\n",
    "\n",
    "- 1-ый вариант: $tf-idf$ преобразование (sklearn.feature_extraction.text.TfidfTransformer) и сингулярное разложение (оно же – латентый семантический анализ) (sklearn.decomposition.TruncatedSVD),\n",
    "\n",
    "- 2-ой вариант: тематические модели LDA (sklearn.decomposition.LatentDirichletAllocation).\n",
    "\n",
    "Для оценки качества классификации необходимо использовать accuracy и F-measure "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "uVeesfvCcNw-",
   "metadata": {
    "id": "uVeesfvCcNw-"
   },
   "outputs": [],
   "source": [
    "def text_prepare(text):\n",
    "    text_prep_model.fit(text)\n",
    "    tokens = text_prep_model.get_tokens(lemmatize=True, stopwords=True)\n",
    "    return ' '.join(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "hgCItUMNYbFG",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 363
    },
    "id": "hgCItUMNYbFG",
    "outputId": "b524ce43-7e8e-4ca2-e313-035c42cb8dc4"
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>открыть вклад счёт usd плюс этот зарплатный ка...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>добрый время вчера поступить смс уведомление б...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>76</th>\n",
       "      <td>г около час прийти указанный офис намерение по...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>85</th>\n",
       "      <td>оплата коммунальный платёж пользоваться пласти...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>94</th>\n",
       "      <td>апрель год прийти отделение сбербанк г чапаевс...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>96</th>\n",
       "      <td>продолжение история который тянуться март г ht...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>98</th>\n",
       "      <td>досрочно закрывать вклад отпуск жена заказыват...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>101</th>\n",
       "      <td>являться клиент данный банк год точно упомнить...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>103</th>\n",
       "      <td>отвратительный отношение вкладчик прийти попол...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>105</th>\n",
       "      <td>сегодня июнь тихвинский отказать выплата проце...</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                  text  target\n",
       "19   открыть вклад счёт usd плюс этот зарплатный ка...       0\n",
       "33   добрый время вчера поступить смс уведомление б...       0\n",
       "76   г около час прийти указанный офис намерение по...       0\n",
       "85   оплата коммунальный платёж пользоваться пласти...       0\n",
       "94   апрель год прийти отделение сбербанк г чапаевс...       0\n",
       "96   продолжение история который тянуться март г ht...       0\n",
       "98   досрочно закрывать вклад отпуск жена заказыват...       1\n",
       "101  являться клиент данный банк год точно упомнить...       0\n",
       "103  отвратительный отношение вкладчик прийти попол...       0\n",
       "105  сегодня июнь тихвинский отказать выплата проце...       0"
      ]
     },
     "execution_count": 33,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "\"\"\"\n",
    "df_text = df_banki.loc[df_banki.rating_grade.isin([1.0, 5.0]), ['text', 'rating_grade']]\n",
    "df_text.loc[:, 'target'] = df_text.rating_grade.apply(lambda x: 0 if x == 1.0 else 1)\n",
    "df_text = df_text[['text', 'target']]\n",
    "df_text.text = df_text.text.apply(lambda x: text_prepare(x))\n",
    "df_text.to_pickle('df_text.pickle', protocol=3)\n",
    "\"\"\"\n",
    "# Откроем ранее преобразованный текст, обработанный с помощью класса TextPrepare\n",
    "# Файл лежит в архиве df_text.7z\n",
    "df_text = pd.read_pickle('df_text.pickle')\n",
    "df_text[:10]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "DL3QOnVecokm",
   "metadata": {
    "id": "DL3QOnVecokm"
   },
   "source": [
    "Разделим данные на обучающую и тестовую выборки. С помощью стратификации обеспечим пропорциональное распределение классов в выборках, как в исходном распределении."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "M7BfCW6H-27w",
   "metadata": {
    "id": "M7BfCW6H-27w"
   },
   "outputs": [],
   "source": [
    "X_train, X_test, y_train, y_test = train_test_split(df_text['text'], df_text.target, test_size=0.15, \n",
    "                                                    stratify=df_text.target, random_state=42)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "035pSKb7c31f",
   "metadata": {
    "id": "035pSKb7c31f"
   },
   "source": [
    "Не смотря на стратификацию, в обучающих данных есть дисбаланс классов. Подберем веса для обучения модели классификации для каждого класса:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "Rsx4893rLo4g",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "Rsx4893rLo4g",
    "outputId": "26d6a61a-4275-45df-96b2-eb2497d6a6ec"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Вес для негативного класса: 0.7630766316188311\n",
      "Вес для позитивного класса: 0.2369233683811689\n"
     ]
    }
   ],
   "source": [
    "w_neg, w_pos = y_train.value_counts() /  y_train.value_counts().sum()\n",
    "print('Вес для негативного класса:', w_neg)\n",
    "print('Вес для позитивного класса:', w_pos)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "rV3P9c4gdE2G",
   "metadata": {
    "id": "rV3P9c4gdE2G"
   },
   "source": [
    "В первом эксперименте для формирования признаков используем TF-IDF и сжатие пространства с помощью сингулярного разложения. Классификацию будем проводить с помощью ансамблевой модели случайных лесов."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "578AZIA-xV5b",
   "metadata": {
    "id": "578AZIA-xV5b"
   },
   "outputs": [],
   "source": [
    "clf = Pipeline([ \n",
    "    ('vect', TfidfVectorizer()), \n",
    "    ('tm', TruncatedSVD(n_components=30, random_state=42)), \n",
    "    ('clf', RandomForestClassifier(n_estimators=200, max_depth=60, class_weight={0:w_neg, 1:w_pos},\n",
    "                                   random_state=42))\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "DpdT4n2PD3m1",
   "metadata": {
    "id": "DpdT4n2PD3m1"
   },
   "outputs": [],
   "source": [
    "clf.fit(X_train, y_train)\n",
    "y_pred = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "-KfC_9H6EOLp",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "-KfC_9H6EOLp",
    "outputId": "7ad33c36-9102-4ae0-a431-695a89e2e422"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Модель TF-IDF -> SVD -> RandomForest\n",
      "accuracy: 0.9201288244766506\n",
      "f1_score: 0.8101071975497703\n"
     ]
    }
   ],
   "source": [
    "print('Модель TF-IDF -> SVD -> RandomForest')\n",
    "print('accuracy:', accuracy_score(y_test, y_pred))\n",
    "print('f1_score:', f1_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8fwL0s24d8de",
   "metadata": {
    "id": "8fwL0s24d8de"
   },
   "source": [
    "Во втором эксперименте для формирования признаков используем TF-IDF и сжатие пространства с помощью латентного размещения Дирихле. Классификацию будем проводить с помощью ансамблевой модели случайных лесов."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ZpcaPJw8GGY3",
   "metadata": {
    "id": "ZpcaPJw8GGY3"
   },
   "outputs": [],
   "source": [
    "clf = Pipeline([ \n",
    "    ('vect', TfidfVectorizer()), \n",
    "    ('tm', LatentDirichletAllocation(n_components=100, max_iter=100, random_state=42)), \n",
    "    ('clf', RandomForestClassifier(n_estimators=200, max_depth=60, class_weight={0:w_neg, 1:w_pos},\n",
    "                                   random_state=42))\n",
    "])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "MFmDC7kZT6DU",
   "metadata": {
    "id": "MFmDC7kZT6DU"
   },
   "outputs": [],
   "source": [
    "clf.fit(X_train, y_train)\n",
    "y_pred = clf.predict(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "lUf_pN13ePV6",
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "lUf_pN13ePV6",
    "outputId": "ab64ee4e-17cd-4a64-af98-5ec2128b91ee"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Модель TF-IDF -> LatentDirichletAllocation -> RandomForest\n",
      "accuracy: 0.7559849704777241\n",
      "f1_score: 0.052521884118382656\n"
     ]
    }
   ],
   "source": [
    "print('Модель TF-IDF -> LatentDirichletAllocation -> RandomForest')\n",
    "print('accuracy:', accuracy_score(y_test, y_pred))\n",
    "print('f1_score:', f1_score(y_test, y_pred))"
   ]
  }
 ],
 "metadata": {
  "colab": {
   "collapsed_sections": [],
   "name": "Untitled.ipynb",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "02d06e2884af45a98c8a028d0aa75779": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "047167382bc04c2d99d7b55fb3a15a54": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "2f70f53cb8b3495eb84b7463456a4340": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ce46f2c180d14c6bacdfd764dceef258",
      "placeholder": "​",
      "style": "IPY_MODEL_d00ef25ddd2841f2ad7f71998ef5fe7f",
      "value": " 201030/? [03:02&lt;00:00, 1944.99it/s]"
     }
    },
    "31fb0b358fa6469e852a4061f1a70ca2": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "652418a65f4e4fd8b9b2f60d8f5edda0": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_31fb0b358fa6469e852a4061f1a70ca2",
      "placeholder": "​",
      "style": "IPY_MODEL_c95311ef48f4406095b304778a95aecb",
      "value": ""
     }
    },
    "71b0f75cda12470fbe1d005de7cd9b68": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_adfdf91167464e5183efeb02089b3fbf",
       "IPY_MODEL_94709408bc7347ea970916c3248705bb",
       "IPY_MODEL_8731c6c7f0e7404cac842eea1d1d59bc"
      ],
      "layout": "IPY_MODEL_047167382bc04c2d99d7b55fb3a15a54"
     }
    },
    "72d64d39a42146a19e1d373b6078b281": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "8731c6c7f0e7404cac842eea1d1d59bc": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_9f40dda6e1cc4df9a5c7c4bcc4b9f439",
      "placeholder": "​",
      "style": "IPY_MODEL_9f1dc768751b4cefaa594a254f182e75",
      "value": " 153499/153499 [03:53&lt;00:00, 1328.59it/s]"
     }
    },
    "89b5d649d59440c198c997e6cd68398d": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "92016471275947f7af5c1ef166e86611": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": "20px"
     }
    },
    "94709408bc7347ea970916c3248705bb": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_89b5d649d59440c198c997e6cd68398d",
      "max": 153499,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_d187aa0c7fcc471bba4a132166cf6681",
      "value": 153499
     }
    },
    "9ece8493698c43c38731e3c196354482": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "9f1dc768751b4cefaa594a254f182e75": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "9f40dda6e1cc4df9a5c7c4bcc4b9f439": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "a9ad1c47ba4744eaaad443059742db35": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_92016471275947f7af5c1ef166e86611",
      "max": 1,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_dbc753abddc8445991a86435910fbdae",
      "value": 1
     }
    },
    "adfdf91167464e5183efeb02089b3fbf": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_9ece8493698c43c38731e3c196354482",
      "placeholder": "​",
      "style": "IPY_MODEL_02d06e2884af45a98c8a028d0aa75779",
      "value": "100%"
     }
    },
    "c95311ef48f4406095b304778a95aecb": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "ce46f2c180d14c6bacdfd764dceef258": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "d00ef25ddd2841f2ad7f71998ef5fe7f": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "d187aa0c7fcc471bba4a132166cf6681": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "dbc753abddc8445991a86435910fbdae": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "f38d3adb3e7c4797beec874066ccd8eb": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_652418a65f4e4fd8b9b2f60d8f5edda0",
       "IPY_MODEL_a9ad1c47ba4744eaaad443059742db35",
       "IPY_MODEL_2f70f53cb8b3495eb84b7463456a4340"
      ],
      "layout": "IPY_MODEL_72d64d39a42146a19e1d373b6078b281"
     }
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
